2024-11-27 00:32:28 +08:00
|
|
|
from io import BytesIO
|
|
|
|
from documents.models import Document
|
2024-11-24 14:08:27 +08:00
|
|
|
from django.db.models.signals import post_save
|
|
|
|
from django.dispatch import receiver
|
|
|
|
from config.settings import MEDIA_ROOT
|
|
|
|
import os
|
|
|
|
import fitz
|
|
|
|
import pytesseract
|
|
|
|
from PIL import Image
|
|
|
|
from .models import Document
|
|
|
|
|
|
|
|
|
|
|
|
@receiver(post_save, sender=Document)
def document_post_save(sender, instance, **kwargs):
    """Fill in ``ocr_metadata`` for a saved document by OCR-ing its pages.

    Runs after every ``Document`` save. If the instance already carries
    OCR text the handler does nothing. Otherwise each page of the file at
    ``MEDIA_ROOT/<instance.file.name>`` is rendered to an image, passed
    through Tesseract, and the per-page text is stored on the instance.

    Args:
        sender: The model class that emitted the signal (``Document``).
        instance: The ``Document`` instance that was just saved.
        **kwargs: Remaining ``post_save`` signal arguments (unused).
    """
    # Guard clause: already processed, and also the stop condition for the
    # re-entrant call caused by ``instance.save()`` below.
    if instance.ocr_metadata:
        return

    page_texts = []
    with fitz.open(os.path.join(MEDIA_ROOT, instance.file.name)) as doc:
        # Render slightly above native size (1.2x on both axes) before OCR.
        mat = fitz.Matrix(1.2, 1.2)
        for page in doc:
            pix = page.get_pixmap(matrix=mat)
            # Hand the rendered page to PIL through an in-memory buffer and
            # release the image as soon as Tesseract has read it.
            with Image.open(BytesIO(pix.tobytes())) as img:
                text = pytesseract.image_to_string(img).strip()
            if text:
                page_texts.append(text)

    # Separate pages with a newline so the last word of one page does not
    # fuse with the first word of the next.
    metadata = "\n".join(page_texts)

    if not metadata:
        # Nothing was recognised. Saving an empty value would fire
        # post_save again with ``ocr_metadata`` still falsy, re-running OCR
        # and recursing without bound, so leave the instance untouched.
        return

    instance.ocr_metadata = metadata
    # This save re-emits post_save; the guard clause at the top ends the
    # nested call immediately because ``ocr_metadata`` is now non-empty.
    instance.save()
|