diff --git a/README.md b/README.md index baffe897..7b8f77de 100644 --- a/README.md +++ b/README.md @@ -56,10 +56,7 @@ You must first have these set up and ready to go: ``` 12. Go to [Django admin for DocumentCloud](https://api.dev.documentcloud.org/admin) and add the required static [flat page](https://api.dev.documentcloud.org/admin/flatpages/flatpage/) called `/tipofday/`. It can be blank. Do not prefix the URL with `/pages/`. Specifying the `Site` as `example.com` is alright. 13. Create an initial Minio bucket to simulate AWS S3 locally: - - Reference your DocumentCloud `.django` file for these variables: - - Visit the `MINIO_URL` with a browser, likely at [this address](http://minio.documentcloud.org:9000), and login with the minio `MINIO_ACCESS_KEY` and `MINIO_SECRET_KEY` - - At the bottom right corner click the round plus button and then click the first circle that appears above it to "create bucket". - - Create a bucket called `documents` + - Run `inv initialize-minio` 14. Upload a document: - **Check your memory allocation on Docker is at least 7gb.** A sign that you do not have enough memory allocated is if containers are randomly failing or if your system is swapping heavily, especially when uploading documents. - The "upload" button should not be grayed out (if it is, check your user organization Verified Journalist status above) diff --git a/config/settings/base.py b/config/settings/base.py index f7eb4a0b..152e36d8 100644 --- a/config/settings/base.py +++ b/config/settings/base.py @@ -463,7 +463,7 @@ BASE_URL = DOCCLOUD_URL PUBLIC_ASSET_URL = env( - "PUBLIC_ASSET_URL", default="http://minio.documentcloud.org:9000/documents/" + "PUBLIC_ASSET_URL", default="https://minio.documentcloud.org/documents/" ) PRIVATE_ASSET_URL = env("PRIVATE_ASSET_URL", default=f"{DOCCLOUD_API_URL}/files/") diff --git a/documentcloud/common/environment/aws/storage.py b/documentcloud/common/environment/aws/storage.py index 1a480b02..11c7d7ab 100644 --- a/documentcloud/common/environment/aws/storage.py +++ b/documentcloud/common/environment/aws/storage.py @@ -58,23 +58,28 @@ def size(self, file_name): return bucket.Object(key).content_length def open(self, file_name, mode="rb", content_type=None, access=None): - + # This logic changed with smart_open 5.0 + # https://github.com/piskvorky/smart_open/blob/develop/CHANGELOG.md#500-30-mar-2021 + # See migration guide here: + # https://github.com/piskvorky/smart_open/blob/develop/MIGRATING_FROM_OLDER_VERSIONS.rst transport_params = { - "resource_kwargs": self.resource_kwargs, - "multipart_upload_kwargs": {}, + "client": self.s3_client, } - - if content_type is None: - # attempt to guess content type if not specified - content_type = mimetypes.guess_type(file_name)[0] - - if content_type is not None: - # set content type if we have one - transport_params["multipart_upload_kwargs"]["ContentType"] = content_type - - if access is not None: - transport_params["multipart_upload_kwargs"]["ACL"] = ACLS[access] - + if "w" in mode: # Setting these kwargs only make sense in a write context + writeable_kwargs = {} + if content_type is None: + # attempt to guess content type if not specified + content_type = mimetypes.guess_type(file_name)[0] + if content_type is not None: + # set content type if we have one + writeable_kwargs["ContentType"] = content_type + if access is not None: + writeable_kwargs["ACL"] = ACLS[access] + if writeable_kwargs: + # Guard against no writeable kwargs provided + transport_params["client_kwargs"] = { + "S3.Client.create_multipart_upload": writeable_kwargs + } return smart_open.open( f"s3://{file_name}", mode, transport_params=transport_params ) diff --git a/documentcloud/common/environment/minio/storage.py b/documentcloud/common/environment/minio/storage.py index a2cc3c18..25781c67 100644 --- a/documentcloud/common/environment/minio/storage.py +++ b/documentcloud/common/environment/minio/storage.py @@ -13,8 +13,8 @@ def __init__(self, resource_kwargs=None, minio=True): if resource_kwargs is None: resource_kwargs = { "endpoint_url": env.str("MINIO_URL"), - "aws_access_key_id": env.str("MINIO_ACCESS_KEY"), - "aws_secret_access_key": env.str("MINIO_SECRET_KEY"), + "aws_access_key_id": env.str("MINIO_ROOT_USER"), + "aws_secret_access_key": env.str("MINIO_ROOT_PASSWORD"), "config": Config(signature_version="s3v4"), "region_name": "us-east-1", } diff --git a/documentcloud/core/management/commands/initialize_minio.py b/documentcloud/core/management/commands/initialize_minio.py new file mode 100644 index 00000000..40c510fb --- /dev/null +++ b/documentcloud/core/management/commands/initialize_minio.py @@ -0,0 +1,57 @@ +# Django +from django.core.management.base import BaseCommand + +# Standard Library +import json + +# Third Party +import boto3 +import environ +from botocore.client import Config +from botocore.exceptions import ClientError + +env = environ.Env() + + +class Command(BaseCommand): + help = "Initialize Minio bucket and policies for local development" + + def handle(self, *args, **options): + if env.str("ENVIRONMENT") != "local-minio": + return + + client = boto3.client( + "s3", + endpoint_url=env.str("MINIO_URL"), + aws_access_key_id=env.str("MINIO_ROOT_USER"), + aws_secret_access_key=env.str("MINIO_ROOT_PASSWORD"), + config=Config(signature_version="s3v4"), + region_name="us-east-1", + ) + + # Create bucket if it doesn't exist + try: + client.head_bucket(Bucket="documents") + self.stdout.write("Bucket already exists") + except ClientError as e: + error_code = e.response["Error"]["Code"] + if error_code == "404": # Bucket doesn't exist, create it + client.create_bucket(Bucket="documents") + self.stdout.write("Created documents bucket") + else: + raise + + # Set public read policy + policy = { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": "*", + "Action": "s3:GetObject", + "Resource": "arn:aws:s3:::documents/*", + } + ], + } + client.put_bucket_policy(Bucket="documents", Policy=json.dumps(policy)) + self.stdout.write("Minio initialized successfully") diff --git a/documentcloud/documents/processing/info_and_image/main.py b/documentcloud/documents/processing/info_and_image/main.py index 7c198246..6290b426 100755 --- a/documentcloud/documents/processing/info_and_image/main.py +++ b/documentcloud/documents/processing/info_and_image/main.py @@ -768,7 +768,7 @@ def extract_single_page(doc_id, slug, access, page, page_number, large_image_pat image_width, max(round(img_buffer.height * (image_width / img_buffer.width)), 1), ), - Image.ANTIALIAS, + Image.LANCZOS, ) mem_file = io.BytesIO() diff --git a/initialize_dotenvs.py b/initialize_dotenvs.py index 829d0316..24547538 100755 --- a/initialize_dotenvs.py +++ b/initialize_dotenvs.py @@ -70,9 +70,10 @@ def random_string(n): { "name": "MinIO", "envvars": [ - ("MINIO_ACCESS_KEY", lambda: random_string(64)), - ("MINIO_SECRET_KEY", lambda: random_string(64)), - ("MINIO_URL", "http://minio.documentcloud.org:9000"), + ("MINIO_ROOT_USER", lambda: random_string(64)), + ("MINIO_ROOT_PASSWORD", lambda: random_string(64)), + ("MINIO_URL", "https://minio.documentcloud.org"), + ("AWS_CA_BUNDLE", "/etc/ssl/certs/ca-certificates.crt"), ], }, ], diff --git a/local.yml b/local.yml index 1e9a049f..05387510 100644 --- a/local.yml +++ b/local.yml @@ -50,16 +50,18 @@ services: image: redis:5.0 documentcloud_minio: - image: minio/minio:RELEASE.2019-10-12T01-39-57Z + image: minio/minio:RELEASE.2024-12-18T13-15-44Z volumes: - local_minio_data:/data ports: - "9000:9000" - command: server /data + - "9001:9001" + command: server /data --console-address ":9001" env_file: - ./.envs/.local/.django networks: default: + squarelet_default: aliases: - minio.documentcloud.org diff --git a/tasks.py b/tasks.py index e989c12e..bcc2fc9a 100755 --- a/tasks.py +++ b/tasks.py @@ -233,6 +233,10 @@ def download_tesseract_data(c): """Download Tesseract data files. Needed to be able to do OCR locally.""" c.run("cd config/aws/lambda; ./build.sh") +@task +def initialize_minio(c): + """Initialize Minio bucket and policies for local development""" + c.run(DJANGO_RUN.format(cmd="python manage.py initialize_minio")) @task def deploy_lambdas(c, staging=False):