diff --git a/.dockerignore b/.dockerignore deleted file mode 100644 index 3d92c8a..0000000 --- a/.dockerignore +++ /dev/null @@ -1,10 +0,0 @@ -media/ -static/ -documentation/ -.env -.venv/ -.vscode/ -.git/ -.gitignore -.woodpecker/ -**/__pycache__/ \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index ba0ca07..425210d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,7 +10,7 @@ COPY scripts/ /app/scripts/ RUN chmod +x /app/scripts/start.sh # Install packages -RUN apt update && apt install -y graphviz libgraphviz-dev graphviz-dev tesseract-ocr +RUN apt update && apt install -y graphviz libgraphviz-dev graphviz-dev RUN pip3 install --upgrade pip && pip3 install --no-cache-dir -r requirements.txt # Expose port 8000 for the web server diff --git a/docmanager_backend/documents/apps.py b/docmanager_backend/documents/apps.py index e4a5400..37ce729 100644 --- a/docmanager_backend/documents/apps.py +++ b/docmanager_backend/documents/apps.py @@ -4,7 +4,3 @@ from django.apps import AppConfig class DocumentsConfig(AppConfig): default_auto_field = "django.db.models.BigAutoField" name = "documents" - - def ready(self) -> None: - import documents.signals - return super().ready() diff --git a/docmanager_backend/documents/migrations/0002_document_metadata_alter_document_document_type.py b/docmanager_backend/documents/migrations/0002_document_metadata_alter_document_document_type.py deleted file mode 100644 index fb63337..0000000 --- a/docmanager_backend/documents/migrations/0002_document_metadata_alter_document_document_type.py +++ /dev/null @@ -1,25 +0,0 @@ -# Generated by Django 5.1.3 on 2024-11-24 05:17 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ("documents", "0001_initial"), - ] - - operations = [ - migrations.AddField( - model_name="document", - name="metadata", - field=models.TextField(null=True), - ), - migrations.AlterField( - model_name="document", - name="document_type", - field=models.CharField( - choices=[("memorandum", "Memorandum"), ("hoa", "HOA")], max_length=32 - ), - ), - ] diff --git a/docmanager_backend/documents/migrations/0003_remove_document_metadata_document_ocr_metadata.py b/docmanager_backend/documents/migrations/0003_remove_document_metadata_document_ocr_metadata.py deleted file mode 100644 index 42db5c8..0000000 --- a/docmanager_backend/documents/migrations/0003_remove_document_metadata_document_ocr_metadata.py +++ /dev/null @@ -1,22 +0,0 @@ -# Generated by Django 5.1.3 on 2024-11-24 06:04 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ("documents", "0002_document_metadata_alter_document_document_type"), - ] - - operations = [ - migrations.RemoveField( - model_name="document", - name="metadata", - ), - migrations.AddField( - model_name="document", - name="ocr_metadata", - field=models.TextField(blank=True, null=True), - ), - ] diff --git a/docmanager_backend/documents/models.py b/docmanager_backend/documents/models.py index d7c750b..be7d83d 100644 --- a/docmanager_backend/documents/models.py +++ b/docmanager_backend/documents/models.py @@ -1,4 +1,3 @@ - from django.db import models from django.utils.timezone import now import uuid @@ -8,15 +7,17 @@ class Document(models.Model): name = models.CharField(max_length=100) DOCUMENT_TYPE_CHOICES = ( - ("memorandum", "Memorandum"), - ("hoa", "HOA"), - # TODO: Update this list on types of documents + ("pdf", "PDF"), + ("image", "Image"), + ("video", "Video"), + ("doc", "Word Document"), + ("excel", "Excel Document"), + ("ppt", "Powerpoint Document"), ) document_type = models.CharField( max_length=32, choices=DOCUMENT_TYPE_CHOICES, null=False, blank=False ) number_pages = models.IntegerField(null=False, blank=False) - ocr_metadata = models.TextField(null=True, blank=True) def upload_to(instance, filename): _, extension = filename.split(".") diff --git a/docmanager_backend/documents/serializers.py b/docmanager_backend/documents/serializers.py index 5fc2247..80a3594 100644 --- a/docmanager_backend/documents/serializers.py +++ b/docmanager_backend/documents/serializers.py @@ -36,13 +36,12 @@ class DocumentSerializer(serializers.ModelSerializer): class Meta: model = Document fields = ["id", "name", "document_type", - "number_pages", "ocr_metadata", "date_uploaded"] + "number_pages", "date_uploaded"] read_only_fields = [ "id", "name", "document_type", "number_pages", - "ocr_metadata", "date_uploaded", ] @@ -60,17 +59,15 @@ class DocumentFileSerializer(serializers.ModelSerializer): "id", "name", "document_type", - "number_pages", - "ocr_metadata", - "date_uploaded", "file", + "number_pages", + "date_uploaded", ] read_only_fields = [ "id", "name", "document_type", "number_pages", - "ocr_metadata", "date_uploaded", "file", ] diff --git a/docmanager_backend/documents/signals.py b/docmanager_backend/documents/signals.py deleted file mode 100644 index 0b7621c..0000000 --- a/docmanager_backend/documents/signals.py +++ /dev/null @@ -1,26 +0,0 @@ -from django.db.models.signals import post_save -from django.dispatch import receiver -from config.settings import MEDIA_ROOT -import os -import fitz -import pytesseract -from PIL import Image -from .models import Document - - -@receiver(post_save, sender=Document) -def domain_post_save(sender, instance, **kwargs): - if not instance.ocr_metadata: - metadata = "" - with fitz.open(os.path.join(MEDIA_ROOT, instance.file.name)) as doc: - mat = fitz.Matrix(1.2, 1.2) - for page in doc: - pix = page.get_pixmap(matrix=mat) - output = f'{page.number}.jpg' - pix.save(output) - res = str(pytesseract.image_to_string(Image.open(output))) - os.remove(output) - metadata += res - - instance.ocr_metadata = metadata - instance.save() diff --git a/documentation/erd/app_models.png b/documentation/erd/app_models.png index 1680f03..a83f08e 100644 Binary files a/documentation/erd/app_models.png and b/documentation/erd/app_models.png differ diff --git a/requirements.txt b/requirements.txt index eb57c31..680deb8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,6 @@ defusedxml==0.8.0rc2 Django==5.1.3 django-cleanup==9.0.0 django-cors-headers==4.6.0 -django-extensions==3.2.3 django-rest-framework==0.1.0 django-unfold==0.41.0 djangorestframework==3.15.2 @@ -19,28 +18,18 @@ djangorestframework-simplejwt==5.3.1 djoser==2.3.1 drf-spectacular==0.27.2 drf-spectacular-sidecar==2024.11.1 -filelock==3.16.1 -fsspec==2024.10.0 gunicorn==23.0.0 idna==3.10 inflection==0.5.1 -Jinja2==3.1.4 jsonschema==4.23.0 jsonschema-specifications==2024.10.1 -MarkupSafe==2.1.5 -mpmath==1.3.0 mypy-extensions==1.0.0 -networkx==3.4.2 oauthlib==3.2.2 packaging==24.2 pathspec==0.12.1 -pillow==11.0.0 platformdirs==4.3.6 pycparser==2.22 -pyflakes==3.2.0 PyJWT==2.10.0 -PyMuPDF==1.24.14 -pytesseract==0.3.13 python-dotenv==1.0.1 python3-openid==3.2.0 PyYAML==6.0.2 @@ -48,14 +37,12 @@ referencing==0.35.1 requests==2.32.3 requests-oauthlib==2.0.0 rpds-py==0.21.0 -setuptools==70.2.0 social-auth-app-django==5.4.2 social-auth-core==4.5.4 sqlparse==0.5.2 -sympy==1.13.1 -typing_extensions==4.12.2 tzdata==2024.2 uritemplate==4.1.1 urllib3==2.2.3 whitenoise==6.8.2 +django-extensions==3.2.3 pygraphviz==1.14; platform_system == 'Linux' \ No newline at end of file