Django: improve performance when dumping model data to CSV files on S3
I have a main model called MainModel and n related models (ModelA, ModelB, ...) that each point to MainModel with a ForeignKey. I want to export n CSV files, each one built from 10k MainModel records.
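For context, the layout looks roughly like this (names and fields are simplified placeholders, not the real schema):

from django.db import models

class MainModel(models.Model):
    club_id = models.IntegerField()

class ModelA(models.Model):
    # every related model has a ForeignKey back to MainModel
    mainmodel = models.ForeignKey(MainModel, on_delete=models.CASCADE)
    created_on = models.DateTimeField(auto_now_add=True)
    updated_on = models.DateTimeField(auto_now=True)
    # ...model-specific fields

# ModelB ... ModelU follow the same pattern: a ForeignKey to MainModel plus their own fields.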
This is the code:
import csv
import datetime
import io
import time

import boto3
from django.conf import settings
from django.core.management.base import BaseCommand

# NOTE: placeholder import paths -- adjust to wherever these live in your project
from myapp.models import MainModel
from myapp.utils import get_model_from_name  # helper that returns the model class for a name in MODELS
MODELS = [
'ModelA',
'ModelB',
'ModelC',
'ModelD',
'ModelE',
'ModelF',
'ModelG',
'ModelH',
'ModelI',
'ModelJ',
'ModelK',
'ModelL',
'ModelM',
'ModelN',
'ModelO',
'ModelP',
'ModelQ',
'ModelR',
'ModelS',
'ModelT',
'ModelU',
]
CHUNK_SIZE = 10000
class Command(BaseCommand):
    help = 'Dump data of all models related to a given club_id into multiple CSV files on S3'

    def add_arguments(self, parser):
        parser.add_argument('club_id', type=int, help='The club_id for filtering the data')
        parser.add_argument('operation_id', type=str, help='The rsync operation ID to structure the folder on S3')
        parser.add_argument('--output', type=str, default='output.csv', help='Base output file name (default: output.csv)')

    def handle(self, *args, **kwargs):
        club_id = kwargs['club_id']
        operation_id = kwargs['operation_id']
        output_file = kwargs['output']

        # Resolve the model classes from the names in MODELS
        models = [get_model_from_name(model_name) for model_name in MODELS]

        s3 = boto3.client('s3')
        bucket_name = 'your-s3-bucket-name'
        server = settings.MY_ENVIRONMENT if settings.MY_ENVIRONMENT else "default_env"
        folder_name = f"{server}-{operation_id}/data"

        mainmodel = MainModel.objects.filter(club_id=club_id)
        total_sessions = mainmodel.count()

        # Union of the field names of every related model, plus a synthetic 'model' column
        all_fields = sorted(set(field.name for model in models for field in model._meta.fields) | {'model'})

        for start in range(0, total_sessions, CHUNK_SIZE):
            time_start = time.time()
            self.stdout.write(f"Processing records from {start} to {start + CHUNK_SIZE}")

            csv_buffer = io.StringIO()
            writer = csv.writer(csv_buffer)
            # Write header row with all field names
            writer.writerow(all_fields)

            chunk_mainmodels = mainmodel[start:start + CHUNK_SIZE]
            for mm in chunk_mainmodels:
                for model in models:
                    # One query per (MainModel row, related model) pair
                    for record in model.objects.filter(mainmodel=mm):
                        row = []
                        for field in all_fields:
                            try:
                                value = getattr(record, field)
                                if field == 'mainmodel':
                                    # Store the parent's pk instead of the related object
                                    value = mm.pk
                                elif field in ['created_on', 'updated_on'] and isinstance(value, (datetime.datetime, datetime.date)):
                                    value = value.isoformat()
                                elif field == "model":
                                    value = model._meta.model_name
                                row.append(value)
                            except AttributeError:
                                # Field belongs to one of the other models in MODELS
                                row.append(None)
                        writer.writerow(row)

            chunk_file_name = f"{output_file.replace('.csv', '')}_part_{start // CHUNK_SIZE + 1}.csv"
            s3_key = f"{folder_name}/{chunk_file_name}"
            s3.put_object(Bucket=bucket_name, Key=s3_key, Body=csv_buffer.getvalue())

            progress_percentage = ((start + len(chunk_mainmodels)) / total_sessions) * 100
            self.stdout.write(self.style.SUCCESS(
                f'Chunk {start // CHUNK_SIZE + 1} completed. Progress: {progress_percentage:.2f}% - '
                f'Time elapsed: {time.time() - time_start:.2f}s'
            ))

        self.stdout.write(self.style.SUCCESS('All data successfully dumped in chunks to S3'))
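I run it as a normal management command, for example (the command name and argument values below are just placeholders):

python manage.py dump_club_data 42 rsync-op-123 --output output.csv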
It takes ~7 minutes to build one 10k chunk and upload it to S3. Is there a way to optimize the process and reduce the runtime?
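For scale: unless I am miscounting, the inner loops issue one query per (MainModel row, related model) pair, so roughly 21 × 10,000 = 210,000 queries per chunk, on top of the per-row Python work.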