Compare commits

...

2 commits

10 changed files with 112 additions and 10 deletions

10
.dockerignore Normal file
View file

@ -0,0 +1,10 @@
media/
static/
documentation/
.env
.venv/
.vscode/
.git/
.gitignore
.woodpecker/
**/__pycache__/

View file

@ -10,7 +10,7 @@ COPY scripts/ /app/scripts/
RUN chmod +x /app/scripts/start.sh
# Install packages
RUN apt update && apt install -y graphviz libgraphviz-dev graphviz-dev
RUN apt update && apt install -y graphviz libgraphviz-dev graphviz-dev tesseract-ocr
RUN pip3 install --upgrade pip && pip3 install --no-cache-dir -r requirements.txt
# Expose port 8000 for the web server

View file

@ -4,3 +4,7 @@ from django.apps import AppConfig
class DocumentsConfig(AppConfig):
    """Django application configuration for the ``documents`` app."""

    name = "documents"
    default_auto_field = "django.db.models.BigAutoField"

    def ready(self) -> None:
        """Load the app's signals module, then defer to ``AppConfig.ready``."""
        # Imported purely for its import-time side effects; the name itself
        # is not used in this method.
        import documents.signals  # noqa: F401

        return super().ready()

View file

@ -0,0 +1,25 @@
# Generated by Django 5.1.3 on 2024-11-24 05:17
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add ``Document.metadata`` and redefine the ``document_type`` field."""

    dependencies = [("documents", "0001_initial")]

    operations = [
        # New nullable text column on the document model.
        migrations.AddField(
            model_name="document",
            name="metadata",
            field=models.TextField(null=True),
        ),
        # document_type becomes a 32-character field limited to two choices.
        migrations.AlterField(
            model_name="document",
            name="document_type",
            field=models.CharField(
                max_length=32,
                choices=[
                    ("memorandum", "Memorandum"),
                    ("hoa", "HOA"),
                ],
            ),
        ),
    ]

View file

@ -0,0 +1,22 @@
# Generated by Django 5.1.3 on 2024-11-24 06:04
from django.db import migrations, models
class Migration(migrations.Migration):
    """Drop ``Document.metadata`` and add ``Document.ocr_metadata``."""

    dependencies = [
        ("documents", "0002_document_metadata_alter_document_document_type"),
    ]

    operations = [
        # The column is removed and a new one is added (not a rename), so
        # values stored in ``metadata`` are not carried over.
        migrations.RemoveField(model_name="document", name="metadata"),
        migrations.AddField(
            model_name="document",
            name="ocr_metadata",
            field=models.TextField(blank=True, null=True),
        ),
    ]

View file

@ -1,3 +1,4 @@
from django.db import models
from django.utils.timezone import now
import uuid
@ -7,17 +8,15 @@ class Document(models.Model):
name = models.CharField(max_length=100)
DOCUMENT_TYPE_CHOICES = (
("pdf", "PDF"),
("image", "Image"),
("video", "Video"),
("doc", "Word Document"),
("excel", "Excel Document"),
("ppt", "Powerpoint Document"),
("memorandum", "Memorandum"),
("hoa", "HOA"),
# TODO: Update this list with the supported document types
)
document_type = models.CharField(
max_length=32, choices=DOCUMENT_TYPE_CHOICES, null=False, blank=False
)
number_pages = models.IntegerField(null=False, blank=False)
ocr_metadata = models.TextField(null=True, blank=True)
def upload_to(instance, filename):
_, extension = filename.split(".")

View file

@ -36,12 +36,13 @@ class DocumentSerializer(serializers.ModelSerializer):
class Meta:
model = Document
fields = ["id", "name", "document_type",
"number_pages", "date_uploaded"]
"number_pages", "ocr_metadata", "date_uploaded"]
read_only_fields = [
"id",
"name",
"document_type",
"number_pages",
"ocr_metadata",
"date_uploaded",
]
@ -59,15 +60,17 @@ class DocumentFileSerializer(serializers.ModelSerializer):
"id",
"name",
"document_type",
"file",
"number_pages",
"ocr_metadata",
"date_uploaded",
"file",
]
read_only_fields = [
"id",
"name",
"document_type",
"number_pages",
"ocr_metadata",
"date_uploaded",
"file",
]

View file

@ -0,0 +1,26 @@
from django.db.models.signals import post_save
from django.dispatch import receiver
from config.settings import MEDIA_ROOT
import os
import fitz
import pytesseract
from PIL import Image
from .models import Document
@receiver(post_save, sender=Document)
def domain_post_save(sender, instance, **kwargs):
    """Fill in ``ocr_metadata`` for a saved Document by OCR-ing its file.

    Every page of the stored file is rendered to a JPEG, passed through
    Tesseract, and the recognized text of all pages is concatenated in page
    order. The result is set on ``instance`` and written to the database.

    Args:
        sender: The model class that sent the signal (``Document``).
        instance: The Document that was just saved.
        **kwargs: Remaining ``post_save`` arguments; unused.
    """
    # Function-scope import so this handler does not depend on a change to
    # the module's top-level import block.
    import tempfile

    # Nothing to do when OCR text already exists or no file is attached.
    if instance.ocr_metadata or not instance.file:
        return

    source_path = os.path.join(MEDIA_ROOT, instance.file.name)
    page_texts = []
    # Render pages into a private scratch directory instead of the process
    # working directory: concurrent saves would otherwise overwrite each
    # other's "<page>.jpg", and the files are removed even if OCR raises.
    with tempfile.TemporaryDirectory() as scratch_dir, \
            fitz.open(source_path) as doc:
        mat = fitz.Matrix(1.2, 1.2)
        for page in doc:
            pix = page.get_pixmap(matrix=mat)
            image_path = os.path.join(scratch_dir, f"{page.number}.jpg")
            pix.save(image_path)
            # Close the image handle before the scratch directory is removed.
            with Image.open(image_path) as image:
                page_texts.append(str(pytesseract.image_to_string(image)))

    metadata = "".join(page_texts)
    instance.ocr_metadata = metadata
    # Persist with a queryset update rather than instance.save(): save()
    # would fire post_save again, and when OCR finds no text the guard above
    # never becomes true, so the handler would recurse without bound.
    Document.objects.filter(pk=instance.pk).update(ocr_metadata=metadata)

Binary file not shown.

Before

Width:  |  Height:  |  Size: 132 KiB

After

Width:  |  Height:  |  Size: 132 KiB

View file

@ -11,6 +11,7 @@ defusedxml==0.8.0rc2
Django==5.1.3
django-cleanup==9.0.0
django-cors-headers==4.6.0
django-extensions==3.2.3
django-rest-framework==0.1.0
django-unfold==0.41.0
djangorestframework==3.15.2
@ -18,18 +19,28 @@ djangorestframework-simplejwt==5.3.1
djoser==2.3.1
drf-spectacular==0.27.2
drf-spectacular-sidecar==2024.11.1
filelock==3.16.1
fsspec==2024.10.0
gunicorn==23.0.0
idna==3.10
inflection==0.5.1
Jinja2==3.1.4
jsonschema==4.23.0
jsonschema-specifications==2024.10.1
MarkupSafe==2.1.5
mpmath==1.3.0
mypy-extensions==1.0.0
networkx==3.4.2
oauthlib==3.2.2
packaging==24.2
pathspec==0.12.1
pillow==11.0.0
platformdirs==4.3.6
pycparser==2.22
pyflakes==3.2.0
PyJWT==2.10.0
PyMuPDF==1.24.14
pytesseract==0.3.13
python-dotenv==1.0.1
python3-openid==3.2.0
PyYAML==6.0.2
@ -37,12 +48,14 @@ referencing==0.35.1
requests==2.32.3
requests-oauthlib==2.0.0
rpds-py==0.21.0
setuptools==70.2.0
social-auth-app-django==5.4.2
social-auth-core==4.5.4
sqlparse==0.5.2
sympy==1.13.1
typing_extensions==4.12.2
tzdata==2024.2
uritemplate==4.1.1
urllib3==2.2.3
whitenoise==6.8.2
django-extensions==3.2.3
pygraphviz==1.14; platform_system == 'Linux'