Extracting text from PDF after uploading (Django app on AWS)

@login_required(login_url="/login/")
def upload_view(request):
    """Render the upload page and, on a valid POST, store a new post.

    On a valid POST the new post receives a slug derived from its title
    (made unique with a numeric suffix), the requesting user as its owner,
    and the text returned by ``process_document`` for the uploaded file.
    The post is then saved, the form's many-to-many data is saved, and
    ``notify_users_about_new_post`` is called. An invalid POST only flashes
    a warning message. Every request renders ``upload_file.html`` with the
    posts, the most common tags and the (bound or unbound) form.

    Args:
        request: The incoming Django request.

    Returns:
        The rendered ``upload_file.html`` response.

    Raises:
        Exception: Whatever ``process_document`` raises is logged with a
            traceback and re-raised unchanged.
    """
    # Function-scope import: the file's import block is not visible here.
    import logging

    logger = logging.getLogger(__name__)

    posts = Temp.objects.all()
    common_tags = Temp.tags.most_common()

    if request.method == "POST":
        form = TempForm(request.POST, request.FILES, initial={"user": request.user})
        if form.is_valid():
            newpost = form.save(commit=False)

            # Append -1, -2, ... to the base slug until no existing row uses it.
            base_slug = slugify(newpost.title)
            unique_slug = base_slug
            num = 1
            while Temp.objects.filter(slug=unique_slug).exists():
                unique_slug = "{}-{}".format(base_slug, num)
                num += 1
            newpost.slug = unique_slug
            newpost.user = request.user

            # .get() avoids a KeyError (and a 500) when no file was sent.
            # NOTE(review): assumes "file_upload" may be optional on TempForm;
            # if it is required, is_valid() already guarantees its presence.
            uploaded = request.FILES.get("file_upload")
            if uploaded is not None:
                try:
                    extracted_text = process_document(uploaded, newpost.language)
                except Exception:
                    # Record the traceback so a production 500 can be
                    # diagnosed, then let the error propagate as before.
                    logger.exception(
                        "process_document failed for upload %r", uploaded.name
                    )
                    raise
                newpost.ocr_text = extracted_text

                # process_document may have read the stream; rewind it so the
                # later save starts from the beginning of the file.
                if not uploaded.closed:
                    uploaded.seek(0)

            # Saving the instance persists the post (and presumably stores the
            # file via the configured storage backend -- confirm).
            newpost.save()
            form.save_m2m()

            notify_users_about_new_post(form)

            messages.success(request, "Your Post has been uploaded successfully.")
        else:
            messages.warning(request, "Upload unsuccessful. Please try again.")
    else:
        form = TempForm()

    context = {
        "posts": posts,
        "common_tags": common_tags,
        "form": form,
    }
    return render(request, "upload_file.html", context)

Currently, users can upload files, which are stored in S3. I want to extract the content from each uploaded file and store it in the MySQL database. I tried several approaches, and they all failed.

  1. I used the following code. It works when I test it locally, but fails to download the file in production.
urllib.request.urlretrieve(
   [link to S3 storage, where the document can be viewed publicly],
    "temp.pdf",
)
  1. I used boto3. It works when I test it locally, but fails to download the file in production.
import boto3

# Set environment variables
# [KEY] and [REGION] are redacted placeholders, not literal values.
# These are the standard variable names boto3 reads its credentials and
# default region from.
# NOTE(review): `os` is not imported in the visible snippet -- confirm it is
# imported elsewhere.
# NOTE(review): on EC2, boto3 can presumably obtain credentials from an IAM
# instance role instead of hard-coded keys -- confirm before relying on it.
os.environ["AWS_ACCESS_KEY_ID"] = [KEY]
os.environ["AWS_SECRET_ACCESS_KEY"] = [KEY]
os.environ["AWS_DEFAULT_REGION"] = [REGION]


def download_file_from_s3(bucket_name: str, object_key: str, file_name: str) -> bool:
    """Download one S3 object to a local path and report whether it worked.

    Failures are logged with a full traceback rather than printed, so they
    stay visible where stdout is not captured, and the caller can branch on
    the result instead of assuming the file exists.

    Args:
        bucket_name: Name of the S3 bucket holding the object.
        object_key: Key of the object inside the bucket.
        file_name: Local path the object is written to. A relative path is
            resolved against the process's current working directory, not
            against the directory of this module.

    Returns:
        True if the download completed, False if it raised.
    """
    # Function-scope import: the file's import block is not fully visible here.
    import logging

    logger = logging.getLogger(__name__)

    s3 = boto3.client("s3")
    try:
        s3.download_file(bucket_name, object_key, file_name)
    except Exception:
        # Deliberately broad: this helper is best-effort and must not start
        # raising for callers that relied on the old swallow-and-print.
        logger.exception(
            "Error downloading %s from S3 bucket %s to %s",
            object_key,
            bucket_name,
            file_name,
        )
        return False

    logger.info("File %s downloaded from S3 bucket %s.", file_name, bucket_name)
    return True

  1. I used the view code shown at the top, but got a server 500 error. I believe the error is in:
extracted_text = process_document(
                request.FILES["file_upload"], newpost.language
            )

Any suggestions on what might be the problem? I think the error might be due to two reasons:

  1. I cannot download anything on the EC2 instance.
  2. The file might be downloaded, but not into the same directory as the view.py file, so the relative path that is passed is wrong.
Back to Top