見出し画像

【AWS Glue】S3ファイル操作覚書

①フォルダ配下のファイルを削除する(Glue ETL)

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

## @params: [JOB_NAME]
# Resolve the JOB_NAME argument from the arguments the job was started with.
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

# Build the Spark context and wrap it in a Glue context. The Spark session is
# taken from the Glue context; it is not used by the statements shown here.
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
# Recursively delete the files under the temp/ prefix. According to the notes
# accompanying this script, the temp "folder" itself remains afterwards.
# NOTE(review): retentionPeriod=0 presumably means "keep nothing, purge every
# file regardless of age" -- confirm against the Glue purge_s3_path docs.
glueContext.purge_s3_path("s3://s3-bucket-name/temp/", options={"retentionPeriod":0}, transformation_ctx="")
# NOTE(review): the Job object is created and initialised only after the purge
# has already run; confirm that this ordering is intentional.
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
job.commit()
  • purge_s3_path

    • 指定された Amazon S3 パスからファイルを再帰的に削除します

  • tempフォルダは残る

②ファイルリネーム(Python shell)

※実際にはリネームではなく、コピー先のファイル名を指定してコピーしている(コピー元のファイルは残る。プレフィックス配下に複数のファイルがある場合は、同じコピー先キーに順次上書きされる)

import sys
import boto3
import re

# Bucket and key prefix whose objects are listed as copy sources.
source_bucket = 's3-source-bucket'
source_prefix = 'original/'
# Bucket and key prefix that the copy is written to.
target_bucket = 's3-target-bucket'
target_prefix = 'target/'
# File name appended to target_prefix to form the destination key; it is read
# as a module-level global by copy_all_keys_v2.
target_file = 'target_file.csv'

def copy_all_keys_v2(source_bucket='', source_prefix='', target_bucket='', target_prefix='',
                     target_name=None, client=None):
    """Copy the objects under a source prefix to one fixed destination key.

    Every object listed under ``source_prefix`` is copied to the same key,
    ``target_prefix`` + file name, which is how this script "renames" a file.
    When several objects match, each copy overwrites the previous one, so the
    object copied last is what remains at the destination. The source objects
    are not deleted.

    Args:
        source_bucket: Bucket to list objects from.
        source_prefix: Key prefix used to select the source objects.
        target_bucket: Bucket to copy into.
        target_prefix: Key prefix of the destination object.
        target_name: Destination file name. Defaults to the module-level
            ``target_file`` when omitted.
        client: Object providing ``list_objects_v2`` and ``copy_object``
            (a boto3 S3 client). Defaults to the module-level ``s3client``
            when omitted.

    Returns:
        The number of objects that were copied.
    """
    # Fall back to the module-level globals so existing callers keep working.
    s3 = s3client if client is None else client
    dest_key = target_prefix + (target_file if target_name is None else target_name)

    copied_count = 0
    list_kwargs = {'Bucket': source_bucket, 'Prefix': source_prefix}

    while True:
        response = s3.list_objects_v2(**list_kwargs)

        # 'Contents' is absent when a page has no matching keys.
        for content in response.get('Contents', []):
            source_key = content['Key']
            # Keys ending in '/' are folder placeholders, not files; copying
            # one would overwrite the destination with an empty object.
            if source_key.endswith('/'):
                continue
            print('Copying: s3://{}/{} To s3://{}/{}'.format(
                source_bucket, source_key, target_bucket, dest_key))
            s3.copy_object(
                Bucket=target_bucket,
                Key=dest_key,
                CopySource={'Bucket': source_bucket, 'Key': source_key},
            )
            copied_count += 1

        # Keep paging only while the service hands back a usable token.
        next_token = response.get('NextContinuationToken')
        if not next_token:
            break
        list_kwargs['ContinuationToken'] = next_token

    print(copied_count)
    return copied_count

# Script entry point: create the S3 client as the module-level global
# `s3client` (copy_all_keys_v2 reads it as a global), then run the copy with
# the bucket/prefix settings defined at the top of the script.
if __name__ == "__main__":
    s3client = boto3.client('s3')
    copy_all_keys_v2(source_bucket, source_prefix, target_bucket, target_prefix)

いいなと思ったら応援しよう!