mirror of
https://github.com/lemeow125/DocManagerBackend.git
synced 2025-01-19 01:23:02 +08:00
Add OCR metadata field processing for documents
This commit is contained in:
parent
9b78fdd9ae
commit
b76adb3601
6 changed files with 87 additions and 8 deletions
|
@ -4,3 +4,7 @@ from django.apps import AppConfig
|
||||||
class DocumentsConfig(AppConfig):
    """Django application configuration for the ``documents`` app."""

    name = "documents"
    default_auto_field = "django.db.models.BigAutoField"

    def ready(self) -> None:
        """Import the app's signal module, then defer to ``AppConfig.ready``."""
        # Imported only for its side effect: loading the module runs the
        # @receiver decorators that connect the signal handlers.
        from documents import signals  # noqa: F401

        return super().ready()
|
||||||
|
|
|
@ -0,0 +1,25 @@
|
||||||
|
# Generated by Django 5.1.3 on 2024-11-24 05:17
|
||||||
|
|
||||||
|
from django.db import migrations, models
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
    """Add a nullable ``metadata`` text field and redefine ``document_type`` choices."""

    dependencies = [("documents", "0001_initial")]

    operations = [
        # New free-text column on Document; NULL is allowed.
        migrations.AddField(
            model_name="document",
            name="metadata",
            field=models.TextField(null=True),
        ),
        # document_type now offers only the "memorandum" and "hoa" choices.
        migrations.AlterField(
            model_name="document",
            name="document_type",
            field=models.CharField(
                max_length=32,
                choices=[
                    ("memorandum", "Memorandum"),
                    ("hoa", "HOA"),
                ],
            ),
        ),
    ]
|
|
@ -0,0 +1,22 @@
|
||||||
|
# Generated by Django 5.1.3 on 2024-11-24 06:04
|
||||||
|
|
||||||
|
from django.db import migrations, models
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
    """Rename ``Document.metadata`` to ``ocr_metadata`` and allow blank values.

    The resulting schema is a single ``ocr_metadata`` column defined as
    ``TextField(blank=True, null=True)``.
    """

    dependencies = [
        ("documents", "0002_document_metadata_alter_document_document_type"),
    ]

    operations = [
        # Rename instead of remove-then-add so any text already stored in
        # ``metadata`` is carried over to the new column rather than dropped.
        migrations.RenameField(
            model_name="document",
            old_name="metadata",
            new_name="ocr_metadata",
        ),
        # Final field definition: same type, now also ``blank=True``.
        migrations.AlterField(
            model_name="document",
            name="ocr_metadata",
            field=models.TextField(blank=True, null=True),
        ),
    ]
|
|
@ -1,3 +1,4 @@
|
||||||
|
|
||||||
from django.db import models
|
from django.db import models
|
||||||
from django.utils.timezone import now
|
from django.utils.timezone import now
|
||||||
import uuid
|
import uuid
|
||||||
|
@ -7,17 +8,15 @@ class Document(models.Model):
|
||||||
name = models.CharField(max_length=100)
|
name = models.CharField(max_length=100)
|
||||||
|
|
||||||
DOCUMENT_TYPE_CHOICES = (
|
DOCUMENT_TYPE_CHOICES = (
|
||||||
("pdf", "PDF"),
|
("memorandum", "Memorandum"),
|
||||||
("image", "Image"),
|
("hoa", "HOA"),
|
||||||
("video", "Video"),
|
# TODO: Extend this list with the remaining document types
|
||||||
("doc", "Word Document"),
|
|
||||||
("excel", "Excel Document"),
|
|
||||||
("ppt", "Powerpoint Document"),
|
|
||||||
)
|
)
|
||||||
document_type = models.CharField(
|
document_type = models.CharField(
|
||||||
max_length=32, choices=DOCUMENT_TYPE_CHOICES, null=False, blank=False
|
max_length=32, choices=DOCUMENT_TYPE_CHOICES, null=False, blank=False
|
||||||
)
|
)
|
||||||
number_pages = models.IntegerField(null=False, blank=False)
|
number_pages = models.IntegerField(null=False, blank=False)
|
||||||
|
ocr_metadata = models.TextField(null=True, blank=True)
|
||||||
|
|
||||||
def upload_to(instance, filename):
|
def upload_to(instance, filename):
|
||||||
_, extension = filename.split(".")
|
_, extension = filename.split(".")
|
||||||
|
|
|
@ -36,12 +36,13 @@ class DocumentSerializer(serializers.ModelSerializer):
|
||||||
class Meta:
|
class Meta:
|
||||||
model = Document
|
model = Document
|
||||||
fields = ["id", "name", "document_type",
|
fields = ["id", "name", "document_type",
|
||||||
"number_pages", "date_uploaded"]
|
"number_pages", "ocr_metadata", "date_uploaded"]
|
||||||
read_only_fields = [
|
read_only_fields = [
|
||||||
"id",
|
"id",
|
||||||
"name",
|
"name",
|
||||||
"document_type",
|
"document_type",
|
||||||
"number_pages",
|
"number_pages",
|
||||||
|
"ocr_metadata",
|
||||||
"date_uploaded",
|
"date_uploaded",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -59,15 +60,17 @@ class DocumentFileSerializer(serializers.ModelSerializer):
|
||||||
"id",
|
"id",
|
||||||
"name",
|
"name",
|
||||||
"document_type",
|
"document_type",
|
||||||
"file",
|
|
||||||
"number_pages",
|
"number_pages",
|
||||||
|
"ocr_metadata",
|
||||||
"date_uploaded",
|
"date_uploaded",
|
||||||
|
"file",
|
||||||
]
|
]
|
||||||
read_only_fields = [
|
read_only_fields = [
|
||||||
"id",
|
"id",
|
||||||
"name",
|
"name",
|
||||||
"document_type",
|
"document_type",
|
||||||
"number_pages",
|
"number_pages",
|
||||||
|
"ocr_metadata",
|
||||||
"date_uploaded",
|
"date_uploaded",
|
||||||
"file",
|
"file",
|
||||||
]
|
]
|
||||||
|
|
26
docmanager_backend/documents/signals.py
Normal file
26
docmanager_backend/documents/signals.py
Normal file
|
@ -0,0 +1,26 @@
|
||||||
|
from django.db.models.signals import post_save
|
||||||
|
from django.dispatch import receiver
|
||||||
|
from config.settings import MEDIA_ROOT
|
||||||
|
import os
|
||||||
|
import fitz
|
||||||
|
import pytesseract
|
||||||
|
from PIL import Image
|
||||||
|
from .models import Document
|
||||||
|
|
||||||
|
|
||||||
|
@receiver(post_save, sender=Document)
def domain_post_save(sender, instance, **kwargs):
    """Fill in ``ocr_metadata`` for a saved document that has none yet.

    Every page of the stored file is rendered with PyMuPDF and run through
    Tesseract; the recognised text of all pages is concatenated and stored
    in ``instance.ocr_metadata``.

    Args:
        sender: Model class that emitted the signal (``Document``).
        instance: The ``Document`` that was just saved.
        **kwargs: Remaining ``post_save`` arguments; unused.
    """
    # Skip documents that already carry text, and documents without a file
    # (there is nothing to open in that case).
    if instance.ocr_metadata or not instance.file:
        return

    page_texts = []
    with fitz.open(os.path.join(MEDIA_ROOT, instance.file.name)) as doc:
        # Render at 1.2x zoom on both axes.
        mat = fitz.Matrix(1.2, 1.2)
        for page in doc:
            pix = page.get_pixmap(matrix=mat)
            # Hand the pixels to PIL in memory. Writing "<page>.jpg" into the
            # working directory collides between concurrent saves and leaves
            # the file behind if OCR raises. get_pixmap() defaults to RGB
            # without alpha, which matches the "RGB" mode used here.
            image = Image.frombytes(
                "RGB", (pix.width, pix.height), pix.samples
            )
            page_texts.append(str(pytesseract.image_to_string(image)))

    metadata = "".join(page_texts)
    instance.ocr_metadata = metadata
    # QuerySet.update() writes the column without emitting post_save.
    # Calling instance.save() here re-enters this handler, and recurses
    # without end when OCR yields an empty string (still falsy).
    Document.objects.filter(pk=instance.pk).update(ocr_metadata=metadata)
|
Loading…
Reference in a new issue