As part of our company consolidation, we had to migrate a large amount of data between different AWS accounts. We looked into using the plain AWS CLI and S3 Batch Replication, but both have their limits; mainly, some of our files are larger than 5 GB.
After talking to AWS Support, we ended up going with AWS DataSync. We first set it up via the AWS console, then via the CLI, and lastly via Terraform.
- CLI Settings
- Source Bucket
- Destination Bucket
- CloudWatch
- IAM Role
- IAM Policy
- Bucket Policy
- DataSync Locations
- DataSync Task
- Start the Task
- Task Status
- Verify Data
- References
CLI Settings
Just do a search/replace for the following:
Name | Value | Region | Required |
---|---|---|---|
source s3 bucket | example-source-bucket-us-east-1 | us-east-1 | yes |
dest s3 bucket | example-destination-bucket-us-west-2 | us-west-2 | yes |
dest account | 123456789012 | global | yes |
Terraform Settings
# Deployment environment; interpolated into resource names and used as a tag value.
variable "environment" {
  default = "qa"
}

# Default region value for this configuration.
variable "region" {
  default = "us-west-2"
}
# Aliased provider for resources created in us-east-1.
provider "aws" {
  alias   = "virginia"
  region  = "us-east-1"
  profile = "qa-profile"

  # NOTE(review): `shared_credentials_file` is deprecated in newer AWS provider
  # releases in favor of `shared_credentials_files` — confirm against the
  # provider version in use.
  shared_credentials_file = pathexpand("~/.aws/credentials")
}

# Aliased provider for resources created in us-west-2.
provider "aws" {
  alias   = "oregon"
  region  = "us-west-2"
  profile = "qa-profile"

  # NOTE(review): same deprecation caveat as the "virginia" provider.
  shared_credentials_file = pathexpand("~/.aws/credentials")
}
Source Bucket
- Account: source
$ aws s3 ls s3://example-source-bucket-us-east-1/ --recursive --summarize --human-readable
...
Total Objects: 41
Total Size: 222.4 GiB
Destination Bucket
- Account: dest
$ aws s3 ls s3://example-destination-bucket-us-west-2/ --recursive --summarize --human-readable
...
Total Objects: 200
Total Size: 72.8 GiB
CloudWatch
We need to set up CloudWatch logs for DataSync.
Check the log publishing policy and create it if it doesn’t exist, then create the log group.
Notice! The log group and policy are created in the same region as the source S3 bucket.
$ aws logs describe-resource-policies --region us-east-1
{
"resourcePolicies": []
}
$ cat > datasync-log-publishing-policy.json <<EOF
{
"Statement": [
{
"Sid": "",
"Effect": "Allow",
"Action": [
"logs:PutLogEvents",
"logs:CreateLogStream"
],
"Principal": {
"Service": "datasync.amazonaws.com"
},
"Resource": "*"
}
],
"Version": "2012-10-17"
}
EOF
# Register the resource policy from datasync-log-publishing-policy.json in us-east-1.
$ aws logs put-resource-policy \
--region us-east-1 \
--policy-name datasync-log-publishing-policy \
--policy-document file://datasync-log-publishing-policy.json
# Create the /aws/datasync log group in the same region.
$ aws logs create-log-group --region us-east-1 --log-group-name /aws/datasync
Terraform CloudWatch
# CloudWatch log group for DataSync, created through the aliased "virginia"
# (us-east-1) provider. The environment name is appended to the group name.
module "cloudwatch_virginia_datasync" {
  source  = "terraform-aws-modules/cloudwatch/aws//modules/log-group"
  version = "2.5.0"

  providers = {
    aws = aws.virginia
  }

  name              = "/aws/datasync/${var.environment}"
  retention_in_days = 120

  tags = {
    name        = "/aws/datasync"
    environment = var.environment
    infra       = "terraform"
    resource    = "cloudwatch"
  }
}
# Policy document that lets the DataSync service principal create log streams
# and put log events on CloudWatch Logs resources.
data "aws_iam_policy_document" "datasync_log_publishing_policy_virginia" {
  statement {
    actions = [
      "logs:CreateLogStream",
      "logs:PutLogEvents",
    ]

    resources = ["arn:aws:logs:*"]

    principals {
      type        = "Service"
      identifiers = ["datasync.amazonaws.com"]
    }
  }
}
# Registers the log-publishing policy document as a CloudWatch Logs resource
# policy through the "virginia" (us-east-1) provider.
resource "aws_cloudwatch_log_resource_policy" "datasync_log_publishing_policy_virginia" {
  provider = aws.virginia

  policy_name     = "${var.environment}-datasync-log-publishing-policy"
  policy_document = data.aws_iam_policy_document.datasync_log_publishing_policy_virginia.json
}
IAM Role
- Account: dest
arn:aws:iam::123456789012:role/datasync-role-between-accounts
$ cat > datasync-role-trust-policy.json <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {
"Service": "datasync.amazonaws.com"
},
"Action": "sts:AssumeRole"
}
]
}
EOF
# Create the role with the trust policy from datasync-role-trust-policy.json,
# which allows datasync.amazonaws.com to assume it.
$ aws iam create-role \
--role-name datasync-role-between-accounts \
--assume-role-policy-document file://datasync-role-trust-policy.json \
--tags Key=infra,Value=cli
Terraform IAM Role
# IAM role that trusts the DataSync service and carries the S3 access policy
# built by the iam_policy_datasync_between_accounts module.
module "iam_assumable_role_datasync_between_accounts" {
  source  = "terraform-aws-modules/iam/aws//modules/iam-assumable-role"
  version = "~> 4.3"

  create_role       = true
  role_name         = "${var.environment}-datasync-between-accounts"
  role_description  = "Role for DataSync between accounts"
  role_requires_mfa = false

  # Trust the DataSync service principal.
  trusted_role_services = [
    "datasync.amazonaws.com"
  ]

  # The count below must match the number of ARNs in this list.
  custom_role_policy_arns = [
    module.iam_policy_datasync_between_accounts.arn,
  ]
  number_of_custom_role_policy_arns = 1

  tags = {
    name        = "datasync-between-accounts"
    environment = var.environment
    infra       = "terraform"
    resource    = "iam"
  }
}
IAM Policy
- Account: dest
arn:aws:iam::123456789012:policy/datasync-policy-between-accounts
$ cat > datasync-role-permissions-policy.json <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Action": [
"s3:GetBucketLocation",
"s3:ListBucket",
"s3:ListBucketMultipartUploads"
],
"Effect": "Allow",
"Resource": [
"arn:aws:s3:::example-source-bucket-us-east-1",
"arn:aws:s3:::example-destination-bucket-us-west-2"
]
},
{
"Action": [
"s3:AbortMultipartUpload",
"s3:DeleteObject",
"s3:GetObject",
"s3:ListMultipartUploadParts",
"s3:PutObjectTagging",
"s3:GetObjectTagging",
"s3:PutObject"
],
"Effect": "Allow",
"Resource": [
"arn:aws:s3:::example-source-bucket-us-east-1/*",
"arn:aws:s3:::example-destination-bucket-us-west-2/*"
]
}
]
}
EOF
# Create a managed policy (an inline policy added with put-role-policy has no ARN,
# so it could not match the policy ARN listed for this section), then attach it to the role.
$ aws iam create-policy \
--policy-name datasync-policy-between-accounts \
--policy-document file://datasync-role-permissions-policy.json \
--tags Key=infra,Value=cli
$ aws iam attach-role-policy \
--role-name datasync-role-between-accounts \
--policy-arn arn:aws:iam::123456789012:policy/datasync-policy-between-accounts
Terraform IAM Policy
# Managed IAM policy built from the datasync_between_accounts policy document.
module "iam_policy_datasync_between_accounts" {
  source  = "terraform-aws-modules/iam/aws//modules/iam-policy"
  version = "~> 4.3"

  name        = "${var.environment}-datasync-between-accounts"
  path        = "/"
  description = "Managed by Terraform"

  policy = data.aws_iam_policy_document.datasync_between_accounts.json

  tags = {
    name        = "datasync-between-accounts"
    environment = var.environment
    infra       = "terraform"
    resource    = "iam"
  }
}
# S3 permissions for the DataSync role, covering both the source and the
# destination bucket.
data "aws_iam_policy_document" "datasync_between_accounts" {
  # Bucket-level actions (resources are the bucket ARNs themselves).
  statement {
    actions = [
      "s3:GetBucketLocation",
      "s3:ListBucket",
      "s3:ListBucketMultipartUploads",
    ]

    resources = [
      "arn:aws:s3:::example-source-bucket-us-east-1",
      "arn:aws:s3:::example-destination-bucket-us-west-2",
    ]
  }

  # Object-level actions (resources are every key under each bucket).
  statement {
    actions = [
      "s3:AbortMultipartUpload",
      "s3:DeleteObject",
      "s3:GetObject",
      "s3:ListMultipartUploadParts",
      "s3:PutObjectTagging",
      "s3:GetObjectTagging",
      "s3:PutObject",
    ]

    resources = [
      "arn:aws:s3:::example-source-bucket-us-east-1/*",
      "arn:aws:s3:::example-destination-bucket-us-west-2/*",
    ]
  }
}
Bucket Policy
- Account: source
We apply this on the source account; we do not have it in Terraform. The user role we use to execute the Start Task step is arn:aws:iam::123456789012:role/AdminUserRole.
$ cat > bucket-policy.json <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Sid": "BucketPolicyForDataSync",
"Effect": "Allow",
"Principal": {
"AWS": [
"arn:aws:iam::123456789012:role/AdminUserRole",
"arn:aws:iam::123456789012:role/datasync-role-between-accounts"
]
},
"Action": [
"s3:GetBucketLocation",
"s3:ListBucket",
"s3:ListBucketMultipartUploads",
"s3:AbortMultipartUpload",
"s3:DeleteObject",
"s3:GetObject",
"s3:ListMultipartUploadParts",
"s3:PutObject",
"s3:GetObjectTagging",
"s3:PutObjectTagging"
],
"Resource": [
"arn:aws:s3:::example-source-bucket-us-east-1",
"arn:aws:s3:::example-source-bucket-us-east-1/*"
]
}
]
}
EOF
# Apply bucket-policy.json to the source bucket (run with source-account credentials).
$ aws s3api put-bucket-policy \
--bucket example-source-bucket-us-east-1 \
--policy file://bucket-policy.json
DataSync Locations
- Account: dest
- Region: us-east-1, us-west-2
# Source location: registered in us-east-1, scoped to one subdirectory of the source bucket.
$ aws datasync create-location-s3 \
--region us-east-1 \
--s3-bucket-arn arn:aws:s3:::example-source-bucket-us-east-1 \
--s3-config '{"BucketAccessRoleArn":"arn:aws:iam::123456789012:role/datasync-role-between-accounts"}' \
--subdirectory 'output/hls/072919ACHRISTINARRIDE7' \
--tags Key=infra,Value=cli
# Destination location: registered in us-west-2, using the same access role.
$ aws datasync create-location-s3 \
--region us-west-2 \
--s3-bucket-arn arn:aws:s3:::example-destination-bucket-us-west-2 \
--s3-config '{"BucketAccessRoleArn":"arn:aws:iam::123456789012:role/datasync-role-between-accounts"}' \
--subdirectory 'videos' \
--tags Key=infra,Value=cli
Terraform DataSync Locations
# DataSync location for the source bucket, created through the "virginia"
# (us-east-1) provider and scoped to a single subdirectory.
resource "aws_datasync_location_s3" "source_bucket_01" {
  provider = aws.virginia

  s3_bucket_arn = "arn:aws:s3:::example-source-bucket-us-east-1"
  subdirectory  = "/output/hls/072919ACHRISTINARRIDE7"

  # Role DataSync uses to access the bucket.
  s3_config {
    bucket_access_role_arn = module.iam_assumable_role_datasync_between_accounts.iam_role_arn
  }

  tags = {
    name        = "datasync-between-accounts"
    environment = var.environment
    infra       = "terraform"
    resource    = "datasync"
  }
}
# DataSync location for the destination bucket, created through the "oregon"
# (us-west-2) provider; objects land under /videos.
resource "aws_datasync_location_s3" "destination_bucket_01" {
  provider = aws.oregon

  s3_bucket_arn = "arn:aws:s3:::example-destination-bucket-us-west-2"
  subdirectory  = "/videos"

  # Role DataSync uses to access the bucket.
  s3_config {
    bucket_access_role_arn = module.iam_assumable_role_datasync_between_accounts.iam_role_arn
  }

  tags = {
    name        = "datasync-between-accounts"
    environment = var.environment
    infra       = "terraform"
    resource    = "datasync"
  }
}
DataSync Task
- Account: dest
- Region: us-east-1
# Create the task in us-east-1. The source and destination location ARNs are looked up
# inline: list-locations is filtered with jq for a LocationUri containing the bucket name.
# NOTE(review): if several locations exist for the same bucket, the jq filter returns
# more than one ARN and the command will fail — confirm only one location per bucket exists.
$ aws datasync create-task \
--region us-east-1 \
--name datasync-task-between-accounts \
--source-location-arn $(aws datasync list-locations \
--region us-east-1 | \
jq -r '.Locations[] | select(.LocationUri | contains("example-source-bucket-us-east-1")) | .LocationArn') \
--destination-location-arn $(aws datasync list-locations \
--region us-west-2 | \
jq -r '.Locations[] | select(.LocationUri | contains("example-destination-bucket-us-west-2")) | .LocationArn') \
--cloud-watch-log-group-arn arn:aws:logs:us-east-1:123456789012:log-group:/aws/datasync \
--options LogLevel=BASIC,OverwriteMode=NEVER \
--tags Key=infra,Value=cli
# Add an include filter to the task; patterns are separated by "|".
# NOTE(review): the task ARN below looks like a placeholder — substitute the ARN
# returned by create-task / list-tasks.
$ aws datasync update-task \
--region us-east-1 \
--task-arn arn:aws:datasync:us-east-1:123456789012:task/task-1a2s3d4f5g \
--includes FilterType=SIMPLE_PATTERN,Value='/input/081921A_DYAN_B-ENDR30.mp4|/input/032422F_JUSTIN_B-CLMB30.mp4|/input/032422B_DAVANNA_B-HIIT15.mp4'
Terraform DataSync Task
# DataSync task, created through the "virginia" (us-east-1) provider, that copies
# from the source S3 location to the destination S3 location and logs to the
# CloudWatch log group defined by the cloudwatch_virginia_datasync module.
resource "aws_datasync_task" "datasync_between_bucket_01" {
  provider = aws.virginia
  name     = "${var.environment}-datasync-between-accounts-bucket-01"

  source_location_arn      = aws_datasync_location_s3.source_bucket_01.arn
  destination_location_arn = aws_datasync_location_s3.destination_bucket_01.arn
  cloudwatch_log_group_arn = module.cloudwatch_virginia_datasync.cloudwatch_log_group_arn

  options {
    log_level      = "BASIC"
    overwrite_mode = "NEVER"

    # NOTE(review): ownership/permission options are set to NONE, presumably
    # because both locations are S3 — confirm.
    uid               = "NONE"
    gid               = "NONE"
    posix_permissions = "NONE"
  }

  # NOTE(review): the CLI `update-task` example in this document uses an *include*
  # filter with `/input/...`-prefixed names, while this block *excludes* and its
  # first pattern is `/B-ENDR30.mp4` — confirm which filter and patterns are intended.
  excludes {
    filter_type = "SIMPLE_PATTERN"
    value       = "/B-ENDR30.mp4|/032422F_JUSTIN_B-CLMB30.mp4|/032422B_DAVANNA_B-HIIT15.mp4"
  }

  tags = {
    name        = "datasync-between-accounts"
    environment = var.environment
    infra       = "terraform"
    resource    = "datasync"
  }
}
Start the Task
- Account: dest
- Region: us-east-1
$ aws datasync list-tasks --region us-east-1
{
"Tasks": [
{
"TaskArn": "arn:aws:datasync:us-east-1:123456789012:task/task-073d8d9057339cfba",
"Status": "AVAILABLE",
"Name": "datasync-task-between-accounts"
}
]
}
# Start the task; its ARN is looked up by name from list-tasks with jq.
$ aws datasync start-task-execution \
--region us-east-1 \
--task-arn $(aws datasync list-tasks --region us-east-1 | jq -r '.Tasks[] | select(.Name == "datasync-task-between-accounts") | .TaskArn')
Task Status
- Account: dest
- Region: us-east-1
# Describe the execution that is currently transferring. The jq filter only matches
# executions whose Status is "TRANSFERRING", so the lookup is empty (and the command
# fails) when no execution is in that state.
$ aws datasync describe-task-execution \
--region us-east-1 \
--task-execution-arn $(aws datasync list-task-executions \
--region us-east-1 | \
jq -r '.TaskExecutions[] | select(.Status == "TRANSFERRING") | .TaskExecutionArn')
{
"TaskExecutionArn": "arn:aws:datasync:us-east-1:123456789012:task/task-02efc22e5fe177f43/execution/exec-0d9c90a9495dccc79",
"Status": "TRANSFERRING",
"Options": {
"VerifyMode": "POINT_IN_TIME_CONSISTENT",
...
"LogLevel": "BASIC",
"TransferMode": "CHANGED",
"SecurityDescriptorCopyFlags": "NONE"
},
"Excludes": [],
"Includes": [],
"StartTime": "2022-03-26T20:48:47.066000-07:00",
...
"Result": {
"PrepareDuration": 9890,
"PrepareStatus": "PENDING",
"TotalDuration": 123477,
"TransferDuration": 118532,
"TransferStatus": "PENDING",
"VerifyDuration": 0,
"VerifyStatus": "PENDING"
}
}
Verify Data
- Account: dest
$ aws s3 ls s3://example-destination-bucket-us-west-2/videos/ --recursive --summarize --human-readable
...
Total Objects: 67
Total Size: 222.4 KiB
Clean Up
- delete datasync task
- delete datasync locations
- delete role