Add Ollama integration

2025-07-03 02:44:12 +08:00 · 2024-12-07 02:44:45 +08:00 · 2024-12-07 02:44:45 +08:00 · f39f5966d6
commit f39f5966d6
parent 9289166c0e
2 changed files with 87 additions and 9 deletions
--- a/.env.sample
+++ b/.env.sample
@ -21,3 +21,8 @@ ADMIN_PASSWORD = ''
 # To insert test data or not (UNUSED)
 TEST_DATA = "True"
 # Ollama for Categorization 
 OLLAMA_URL = ""
 OLLAMA_USERNAME = ""
 OLLAMA_PASSWORD = ""
--- a/docmanager_backend/config/management/commands/start_watcher.py
+++ b/docmanager_backend/config/management/commands/start_watcher.py
@ -1,3 +1,6 @@
 from ollama import ChatResponse
 import base64
 import httpx
 from django.core.management.base import BaseCommand, CommandError
 from io import BytesIO
@ -10,9 +13,11 @@ from config.settings import MEDIA_ROOT
 from watchdog.observers import Observer
 from watchdog.events import FileSystemEventHandler
 from documents.models import Document
 from config.settings import get_secret
 from django.core.files import File
 import logging
 import time
 from ollama import Client
 class PDFHandler(FileSystemEventHandler):
@ -36,8 +41,26 @@ class PDFHandler(FileSystemEventHandler):
    def process_pdf(self, file_path):
        try:
-            filename = os.path.basename(file_path)
+            # Get the original filename and directory
-            filename = str(filename).replace(" ", "")
+            original_filename = os.path.basename(file_path)
            original_dir = os.path.dirname(file_path)
            # Check if the filename contains spaces
            if " " in original_filename:
                # Create the new filename by replacing spaces
                new_filename = original_filename.replace(" ", "_")
                # Construct the new full file path
                new_file_path = os.path.join(original_dir, new_filename)
                # Rename the file
                os.rename(file_path, new_file_path)
                # Update the filename and file_path variables
                filename = new_filename
                file_path = new_file_path
            else:
                filename = original_filename
            metadata = ""
            document_type = ""
@ -60,14 +83,64 @@ class PDFHandler(FileSystemEventHandler):
                    # Perform OCR
                    text = pytesseract.image_to_string(img).strip()
-                    lines = text.split("\n")
+                    # Get document category
                    # Try to pass image to the Ollama image recognition API first
                    try:
                        client = Client(
                            host=get_secret("OLLAMA_URL"),
                            auth=httpx.BasicAuth(
                                username=get_secret("OLLAMA_USERNAME"), password=get_secret("OLLAMA_PASSWORD"))
                        )
-                    for line in lines:
+                        encoded_image = base64.b64encode(
-                        if line.strip():
+                            img_buffer.getvalue()).decode()
-                            document_type = line.strip().lower()
+
                        attempts = 0
                        while True:
                            if attempts >= 3:
                                raise Exception(
                                    "Unable to categorize using Ollama API")
                            attempts += 1
                            content = f"""
                            Read the text from the image and provide a category.
                            Possible categories are: Announcement, Manual, Form
                            Respond only with the category. No explanations are necessary.
                            """
                            response: ChatResponse = client.chat(
                                model="llama3.2-vision",
                                messages=[
                                    {"role": "user", "content": content,
                                        "images": [encoded_image]},
                                ],
                            )
                            document_type = response["message"]["content"].split(":")[
                                0].replace("*", "").replace(".", "")
                            # A few safety checks if the model does not follow through with output instructions
                            if len(document_type) > 16:
                                self.logger.warning(
                                    f"Ollama API gave incorrect document category: {response["message"]["content"]}. Retrying...")
                            break
-                    if not document_type:
+
-                        document_type = "other"
+                    # If that fails, fall back to the regular OCR text and use its first non-empty line (the title) as a quick-and-dirty category
                    except Exception as e:
                        self.logger.warning(f"Error! {e}")
                        self.logger.warning(
                            "Ollama OCR offloading failed. Falling back to default OCR")
                        lines = text.split("\n")
                        for line in lines:
                            if line.strip():
                                document_type = line.strip().lower()
                                break
                        if not document_type:
                            document_type = "other"
                    metadata += text