DocManagerBackend/docmanager_backend/config/management/commands/start_watcher.py

125 lines
3.7 KiB
Python

from django.core.management.base import BaseCommand, CommandError
from io import BytesIO
from documents.models import Document
from PIL import Image
import pytesseract
import fitz
import os
from config.settings import MEDIA_ROOT
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from documents.models import Document
from django.core.files import File
import logging
import time
class PDFHandler(FileSystemEventHandler):
def __init__(self):
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
self.logger = logging.getLogger(__name__)
self.logger.info("Starting Document Watcher...")
def on_created(self, event):
if event.is_directory:
return None
if event.src_path.endswith(".pdf"):
self.logger.info(f"New PDF file detected: {event.src_path}")
self.process_pdf(event.src_path)
def process_pdf(self, file_path):
try:
filename = os.path.basename(file_path)
metadata = ""
document_type = ""
with fitz.open(file_path) as doc:
num_pages = len(doc)
for page_num in range(num_pages):
page = doc[page_num]
pix = page.get_pixmap(matrix=(1.2, 1.2))
# Convert pixmap to bytes
img_bytes = pix.tobytes()
# Create a BytesIO object
img_buffer = BytesIO(img_bytes)
# Create a PIL Image object from the bytes
img = Image.open(img_buffer)
# Perform OCR
text = pytesseract.image_to_string(img).strip()
lines = text.split("\n")
for line in lines:
if line.strip():
document_type = line.strip().lower()
break
if (
not document_type
or document_type not in Document.DOCUMENT_TYPE_CHOICES
):
document_type = "other"
metadata += text
# Open the file for instance creation
DOCUMENT, created = Document.objects.get_or_create(
name=filename,
defaults={
"number_pages": num_pages,
"ocr_metadata": metadata,
"document_type": document_type,
},
)
if created:
DOCUMENT.file.save(name=filename, content=File(open(file_path, "rb")))
self.logger.info(
f"Document '{filename}' created successfully with type '{document_type}'."
)
else:
self.logger.info(f"Document '{filename}' already exists.")
os.remove(file_path)
except Exception as e:
self.logger.error(f"Error processing PDF: {str(e)}")
class PDFWatcher:
def __init__(self):
self.observer = Observer()
def run(self):
event_handler = PDFHandler()
watch_directory = os.path.join(MEDIA_ROOT, "uploads")
self.observer.schedule(event_handler, watch_directory, recursive=True)
self.observer.start()
try:
while True:
time.sleep(5)
except:
self.observer.stop()
self.observer.join()
class Command(BaseCommand):
help = "Runs a dedicated file watcher service"
def handle(self, *args, **options):
watcher = PDFWatcher()
watcher.run()