2024-11-27 00:32:28 +08:00
|
|
|
from django.core.management.base import BaseCommand, CommandError
|
|
|
|
|
|
|
|
from io import BytesIO
|
|
|
|
from documents.models import Document
|
|
|
|
from PIL import Image
|
|
|
|
import pytesseract
|
|
|
|
import fitz
|
|
|
|
import os
|
|
|
|
from config.settings import MEDIA_ROOT
|
|
|
|
from watchdog.observers import Observer
|
|
|
|
from watchdog.events import FileSystemEventHandler
|
|
|
|
from documents.models import Document
|
2024-11-28 13:41:58 +08:00
|
|
|
from django.core.files import File
|
2024-11-27 00:32:28 +08:00
|
|
|
import logging
|
|
|
|
import time
|
|
|
|
|
|
|
|
|
|
|
|
class PDFHandler(FileSystemEventHandler):
    """Watchdog event handler that turns newly created PDFs into ``Document`` rows.

    For every created ``*.pdf`` file the handler rasterizes each page, runs
    Tesseract OCR on it, derives a document type from the OCR text, stores a
    ``Document`` (keyed by file name) and finally deletes the source file.
    """

    def __init__(self):
        """Configure logging and announce that the watcher is starting."""
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s - %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )
        self.logger = logging.getLogger(__name__)
        self.logger.info("Starting Document Watcher...")

    def on_created(self, event):
        """Handle a filesystem "created" event.

        Directories and files whose path does not end in ``.pdf`` are ignored;
        everything else is handed to :meth:`process_pdf`.
        """
        if event.is_directory:
            return None

        if event.src_path.endswith(".pdf"):
            self.logger.info(f"New PDF file detected: {event.src_path}")
            self.process_pdf(event.src_path)

    def process_pdf(self, file_path):
        """OCR the PDF at *file_path*, store it as a ``Document`` and delete it.

        A ``Document`` is looked up by the file's base name. When none exists,
        one is created with the page count, OCR text and document type, and the
        PDF content is attached to its ``file`` field. The source file is
        removed afterwards in both cases. Any exception is logged (with
        traceback) and swallowed so that one bad file does not stop the
        watcher.
        """
        try:
            filename = os.path.basename(file_path)
            # The PDF is fully closed again before we touch the database or
            # delete the file.
            num_pages, metadata, document_type = self._extract_text(file_path)

            document, created = Document.objects.get_or_create(
                name=filename,
                defaults={
                    "number_pages": num_pages,
                    "ocr_metadata": metadata,
                    "document_type": document_type,
                },
            )

            if created:
                # Close the handle deterministically: an open handle leaks a
                # descriptor per file and blocks os.remove() on Windows.
                with open(file_path, "rb") as pdf_file:
                    document.file.save(name=filename, content=File(pdf_file))
                self.logger.info(
                    f"Document '{filename}' created successfully with type '{document_type}'."
                )
            else:
                self.logger.info(f"Document '{filename}' already exists.")

            os.remove(file_path)
        except Exception as e:
            # Top-level boundary of the watcher thread: log with traceback and
            # keep the service alive.
            self.logger.exception(f"Error processing PDF: {str(e)}")

    def _extract_text(self, file_path):
        """Return ``(num_pages, ocr_text, document_type)`` for the PDF at *file_path*."""
        page_texts = []
        document_type = ""

        with fitz.open(file_path) as doc:
            num_pages = len(doc)
            for page in doc:
                text = self._ocr_page(page)
                # NOTE(review): the type is re-derived on every page, so the
                # last page that contains text decides the final value. This
                # matches the previous behavior; confirm whether only the first
                # page was meant to be used.
                document_type = self._classify(text, document_type)
                page_texts.append(text)

        return num_pages, "".join(page_texts), document_type

    def _ocr_page(self, page):
        """Rasterize one PDF page and return its stripped OCR text."""
        # NOTE(review): PyMuPDF documents ``matrix`` as a Matrix / 6-number
        # sequence; confirm that this 2-tuple really applies a 1.2x zoom.
        pix = page.get_pixmap(matrix=(1.2, 1.2))
        with Image.open(BytesIO(pix.tobytes())) as img:
            return pytesseract.image_to_string(img).strip()

    def _classify(self, text, current):
        """Return the document type implied by one page's OCR text.

        The candidate is the first non-blank line of *text*, stripped and
        lower-cased; when the page has no such line, *current* is re-used.
        Anything empty or not found in ``Document.DOCUMENT_TYPE_CHOICES``
        becomes ``"other"``.
        """
        candidate = next(
            (line.strip().lower() for line in text.split("\n") if line.strip()),
            current,
        )
        # NOTE(review): if DOCUMENT_TYPE_CHOICES is a Django-style list of
        # (value, label) tuples, a plain string is never "in" it and every
        # document ends up as "other" - verify against the model.
        if not candidate or candidate not in Document.DOCUMENT_TYPE_CHOICES:
            return "other"
        return candidate
|
|
|
|
|
|
|
|
|
|
|
|
class PDFWatcher:
    """Runs a watchdog observer over the ``uploads`` directory under ``MEDIA_ROOT``."""

    def __init__(self):
        """Create the (not yet started) filesystem observer."""
        self.observer = Observer()

    def run(self):
        """Watch the uploads directory recursively until interrupted.

        Blocks the calling thread. Ctrl+C (``KeyboardInterrupt``) ends the loop
        quietly; any other exception propagates to the caller. In every case
        the observer thread is stopped and joined before this method exits.
        """
        event_handler = PDFHandler()
        watch_directory = os.path.join(MEDIA_ROOT, "uploads")

        self.observer.schedule(event_handler, watch_directory, recursive=True)
        self.observer.start()

        try:
            # The observer works in its own thread; this loop only keeps the
            # main thread alive.
            while True:
                time.sleep(5)
        except KeyboardInterrupt:
            # Normal shutdown request - nothing to report.
            pass
        finally:
            # Always release the observer thread, whatever ended the loop.
            self.observer.stop()
            self.observer.join()
|
|
|
|
|
|
|
|
|
|
|
|
class Command(BaseCommand):
    """Django management command that starts the PDF watcher service."""

    help = "Runs a dedicated file watcher service"

    def handle(self, *args, **options):
        """Build a ``PDFWatcher`` and run it; returns when its run loop ends."""
        PDFWatcher().run()
|