python s3fs

date: 2024-05-26 excerpt: python s3fsの使い方

python s3fsの使い方

概要

s3fsは、Amazon S3をファイルシステムのように扱うためのPythonライブラリ（バケットをOSにマウントするのではなく、Pythonからファイルライクなインターフェースで操作する）

インストール

$ pip install s3fs

S3上のgzip圧縮されたJSON Lines形式のログファイルをpandasで読み込む

import pandas as pd
import s3fs
import gzip
import json

# Load every gzip-compressed JSON Lines file under an S3 prefix into a single
# pandas DataFrame. Failures are reported and skipped (best effort), so the
# script always finishes with `df` defined.

# S3 bucket name and key prefix to read from
bucket_name = 'your-bucket-name'
prefix = 'your/prefix/'

# Create the S3 filesystem instance
fs = s3fs.S3FileSystem()

# List every object whose key matches the prefix.
# `files` starts as an empty list so that a listing failure does not leave it
# undefined and crash the loop below with a NameError.
files = []
try:
    files = fs.glob(f's3://{bucket_name}/{prefix}*')
except Exception as e:
    print(f"Error occurred while listing files: {e}")
else:
    if not files:
        print("No files found. Please check your bucket and prefix.")

# One DataFrame per successfully read file
dataframes = []

# Read each file: open the raw object in binary mode, decompress it as UTF-8
# text, and parse it as one JSON document per line.
for file in files:
    try:
        with fs.open(file, 'rb') as f, \
                gzip.open(f, 'rt', encoding='utf-8') as gz:
            dataframes.append(pd.read_json(gz, lines=True))
    except FileNotFoundError:
        # The object may have disappeared between listing and opening.
        print(f"File not found: {file}. Skipping.")
    except Exception as e:
        print(f"Error occurred while reading file {file}: {e}")

# Concatenate the per-file frames. pd.concat raises ValueError on an empty
# list, so fall back to an empty DataFrame when nothing could be read.
if dataframes:
    df = pd.concat(dataframes, ignore_index=True)
else:
    df = pd.DataFrame()