diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..3d92c8a
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,10 @@
+media/
+static/
+documentation/
+.env
+.venv/
+.vscode/
+.git/
+.gitignore
+.woodpecker/
+**/__pycache__/
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index 425210d..ba0ca07 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -10,7 +10,7 @@ COPY scripts/ /app/scripts/
 RUN chmod +x /app/scripts/start.sh
 
 # Install packages
-RUN apt update && apt install -y graphviz libgraphviz-dev graphviz-dev
+RUN apt update && apt install -y graphviz libgraphviz-dev graphviz-dev tesseract-ocr
 RUN pip3 install --upgrade pip && pip3 install --no-cache-dir -r requirements.txt
 
 # Expose port 8000 for the web server
diff --git a/docmanager_backend/documents/apps.py b/docmanager_backend/documents/apps.py
index 37ce729..e4a5400 100644
--- a/docmanager_backend/documents/apps.py
+++ b/docmanager_backend/documents/apps.py
@@ -4,3 +4,7 @@ from django.apps import AppConfig
 class DocumentsConfig(AppConfig):
     default_auto_field = "django.db.models.BigAutoField"
     name = "documents"
+
+    def ready(self) -> None:
+        import documents.signals
+        return super().ready()
diff --git a/docmanager_backend/documents/migrations/0002_document_metadata_alter_document_document_type.py b/docmanager_backend/documents/migrations/0002_document_metadata_alter_document_document_type.py
new file mode 100644
index 0000000..fb63337
--- /dev/null
+++ b/docmanager_backend/documents/migrations/0002_document_metadata_alter_document_document_type.py
@@ -0,0 +1,25 @@
+# Generated by Django 5.1.3 on 2024-11-24 05:17
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("documents", "0001_initial"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="document",
+            name="metadata",
+            field=models.TextField(null=True),
+        ),
+        migrations.AlterField(
+            model_name="document",
+            name="document_type",
+            field=models.CharField(
+                choices=[("memorandum", "Memorandum"), ("hoa", "HOA")], max_length=32
+            ),
+        ),
+    ]
diff --git a/docmanager_backend/documents/migrations/0003_remove_document_metadata_document_ocr_metadata.py b/docmanager_backend/documents/migrations/0003_remove_document_metadata_document_ocr_metadata.py
new file mode 100644
index 0000000..42db5c8
--- /dev/null
+++ b/docmanager_backend/documents/migrations/0003_remove_document_metadata_document_ocr_metadata.py
@@ -0,0 +1,22 @@
+# Generated by Django 5.1.3 on 2024-11-24 06:04
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("documents", "0002_document_metadata_alter_document_document_type"),
+    ]
+
+    operations = [
+        migrations.RemoveField(
+            model_name="document",
+            name="metadata",
+        ),
+        migrations.AddField(
+            model_name="document",
+            name="ocr_metadata",
+            field=models.TextField(blank=True, null=True),
+        ),
+    ]
diff --git a/docmanager_backend/documents/models.py b/docmanager_backend/documents/models.py
index be7d83d..d7c750b 100644
--- a/docmanager_backend/documents/models.py
+++ b/docmanager_backend/documents/models.py
@@ -1,3 +1,4 @@
+
 from django.db import models
 from django.utils.timezone import now
 import uuid
@@ -7,17 +8,15 @@ class Document(models.Model):
     name = models.CharField(max_length=100)
 
     DOCUMENT_TYPE_CHOICES = (
-        ("pdf", "PDF"),
-        ("image", "Image"),
-        ("video", "Video"),
-        ("doc", "Word Document"),
-        ("excel", "Excel Document"),
-        ("ppt", "Powerpoint Document"),
+        ("memorandum", "Memorandum"),
+        ("hoa", "HOA"),
+        # TODO: Update this list on types of documents
     )
     document_type = models.CharField(
         max_length=32, choices=DOCUMENT_TYPE_CHOICES, null=False, blank=False
     )
     number_pages = models.IntegerField(null=False, blank=False)
+    ocr_metadata = models.TextField(null=True, blank=True)
 
     def upload_to(instance, filename):
         _, extension = filename.split(".")
diff --git a/docmanager_backend/documents/serializers.py b/docmanager_backend/documents/serializers.py
index 80a3594..5fc2247 100644
--- a/docmanager_backend/documents/serializers.py
+++ b/docmanager_backend/documents/serializers.py
@@ -36,12 +36,13 @@ class DocumentSerializer(serializers.ModelSerializer):
     class Meta:
         model = Document
         fields = ["id", "name", "document_type",
-                  "number_pages", "date_uploaded"]
+                  "number_pages", "ocr_metadata", "date_uploaded"]
         read_only_fields = [
             "id",
             "name",
             "document_type",
             "number_pages",
+            "ocr_metadata",
             "date_uploaded",
         ]
 
@@ -59,15 +60,17 @@ class DocumentFileSerializer(serializers.ModelSerializer):
             "id",
             "name",
             "document_type",
-            "file",
             "number_pages",
+            "ocr_metadata",
             "date_uploaded",
+            "file",
         ]
         read_only_fields = [
             "id",
             "name",
             "document_type",
             "number_pages",
+            "ocr_metadata",
             "date_uploaded",
             "file",
         ]
diff --git a/docmanager_backend/documents/signals.py b/docmanager_backend/documents/signals.py
new file mode 100644
--- /dev/null
+++ b/docmanager_backend/documents/signals.py
@@ -0,0 +1,38 @@
+from django.db.models.signals import post_save
+from django.dispatch import receiver
+from config.settings import MEDIA_ROOT
+import io
+import os
+import fitz
+import pytesseract
+from PIL import Image
+from .models import Document
+
+
+@receiver(post_save, sender=Document)
+def domain_post_save(sender, instance, **kwargs):
+    """Populate ``ocr_metadata`` with OCR text after a Document is saved.
+
+    Each page of the uploaded file is rendered with PyMuPDF and passed to
+    Tesseract; the recognised text of all pages is concatenated and stored.
+    Nothing happens when the document already has OCR text, has no file, or
+    is being loaded from a fixture (``raw=True``).
+    """
+    if kwargs.get("raw") or instance.ocr_metadata or not instance.file:
+        return
+
+    page_texts = []
+    with fitz.open(os.path.join(MEDIA_ROOT, instance.file.name)) as doc:
+        mat = fitz.Matrix(1.2, 1.2)
+        for page in doc:
+            pix = page.get_pixmap(matrix=mat)
+            # Render in memory: a "<page>.jpg" file in the working directory
+            # is shared by concurrent uploads and leaks if OCR raises.
+            with Image.open(io.BytesIO(pix.tobytes("jpg"))) as image:
+                page_texts.append(str(pytesseract.image_to_string(image)))
+
+    metadata = "".join(page_texts)
+    instance.ocr_metadata = metadata
+    # update() bypasses post_save. Calling instance.save() here re-enters this
+    # handler and recurses without bound when OCR yields no text.
+    Document.objects.filter(pk=instance.pk).update(ocr_metadata=metadata)
diff --git a/documentation/erd/app_models.png b/documentation/erd/app_models.png
index a83f08e..1680f03 100644
Binary files a/documentation/erd/app_models.png and b/documentation/erd/app_models.png differ
diff --git a/requirements.txt b/requirements.txt
index 680deb8..eb57c31 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,6 +11,7 @@ defusedxml==0.8.0rc2
 Django==5.1.3
 django-cleanup==9.0.0
 django-cors-headers==4.6.0
+django-extensions==3.2.3
 django-rest-framework==0.1.0
 django-unfold==0.41.0
 djangorestframework==3.15.2
@@ -18,18 +19,28 @@ djangorestframework-simplejwt==5.3.1
 djoser==2.3.1
 drf-spectacular==0.27.2
 drf-spectacular-sidecar==2024.11.1
+filelock==3.16.1
+fsspec==2024.10.0
 gunicorn==23.0.0
 idna==3.10
 inflection==0.5.1
+Jinja2==3.1.4
 jsonschema==4.23.0
 jsonschema-specifications==2024.10.1
+MarkupSafe==2.1.5
+mpmath==1.3.0
 mypy-extensions==1.0.0
+networkx==3.4.2
 oauthlib==3.2.2
 packaging==24.2
 pathspec==0.12.1
+pillow==11.0.0
 platformdirs==4.3.6
 pycparser==2.22
+pyflakes==3.2.0
 PyJWT==2.10.0
+PyMuPDF==1.24.14
+pytesseract==0.3.13
 python-dotenv==1.0.1
 python3-openid==3.2.0
 PyYAML==6.0.2
@@ -37,12 +48,14 @@ referencing==0.35.1
 requests==2.32.3
 requests-oauthlib==2.0.0
 rpds-py==0.21.0
+setuptools==70.2.0
 social-auth-app-django==5.4.2
 social-auth-core==4.5.4
 sqlparse==0.5.2
+sympy==1.13.1
+typing_extensions==4.12.2
 tzdata==2024.2
 uritemplate==4.1.1
 urllib3==2.2.3
 whitenoise==6.8.2
-django-extensions==3.2.3
 pygraphviz==1.14; platform_system == 'Linux'
\ No newline at end of file