Add Ollama integration

2025-09-18 13:39:48 +08:00 · 2024-12-07 02:44:45 +08:00 · 2024-12-07 02:44:45 +08:00 · f39f5966d6
commit f39f5966d6
parent 9289166c0e
2 changed files with 87 additions and 9 deletions
--- a/.env.sample
+++ b/.env.sample
@ -20,4 +20,9 @@ ADMIN_EMAIL = 'admin@test.com'
 ADMIN_PASSWORD = ''

 # To insert test data or not (UNUSED)
-TEST_DATA = "True"
+TEST_DATA = "True"
+
+# Ollama server used for document categorization
+OLLAMA_URL = ""
+OLLAMA_USERNAME = ""
+OLLAMA_PASSWORD = ""
--- a/docmanager_backend/config/management/commands/start_watcher.py
+++ b/docmanager_backend/config/management/commands/start_watcher.py
@ -1,3 +1,6 @@
+from ollama import ChatResponse
+import base64
+import httpx
 from django.core.management.base import BaseCommand, CommandError

 from io import BytesIO
@ -10,9 +13,11 @@ from config.settings import MEDIA_ROOT
 from watchdog.observers import Observer
 from watchdog.events import FileSystemEventHandler
 from documents.models import Document
+from config.settings import get_secret
 from django.core.files import File
 import logging
 import time
+from ollama import Client


 class PDFHandler(FileSystemEventHandler):
@ -36,8 +41,26 @@ class PDFHandler(FileSystemEventHandler):

    def process_pdf(self, file_path):
        try:
-            filename = os.path.basename(file_path)
-            filename = str(filename).replace(" ", "")
+            # Get the original filename and directory
+            original_filename = os.path.basename(file_path)
+            original_dir = os.path.dirname(file_path)
+
+            # Check if the filename contains spaces
+            if " " in original_filename:
+                # Create the new filename by replacing spaces
+                new_filename = original_filename.replace(" ", "_")
+
+                # Construct the new full file path
+                new_file_path = os.path.join(original_dir, new_filename)
+
+                # Rename the file
+                os.rename(file_path, new_file_path)
+
+                # Update the filename and file_path variables
+                filename = new_filename
+                file_path = new_file_path
+            else:
+                filename = original_filename
            metadata = ""
            document_type = ""

@ -60,14 +83,64 @@ class PDFHandler(FileSystemEventHandler):
                    # Perform OCR
                    text = pytesseract.image_to_string(img).strip()

-                    lines = text.split("\n")
+                    # Get document category
+                    # Try to pass image to the Ollama image recognition API first
+                    try:
+                        client = Client(
+                            host=get_secret("OLLAMA_URL"),
+                            auth=httpx.BasicAuth(
+                                username=get_secret("OLLAMA_USERNAME"), password=get_secret("OLLAMA_PASSWORD"))
+                        )

-                    for line in lines:
-                        if line.strip():
-                            document_type = line.strip().lower()
+                        encoded_image = base64.b64encode(
+                            img_buffer.getvalue()).decode()
+
+                        attempts = 0
+                        while True:
+                            if attempts >= 3:
+                                raise Exception(
+                                    "Unable to categorize using Ollama API")
+                            attempts += 1
+
+                            content = f"""
+                            Read the text from the image and provide a category.
+
+                            Possible categories are: Announcement, Manual, Form
+
+                            Respond only with the category. No explanations are necessary.
+                            """
+
+                            response: ChatResponse = client.chat(
+                                model="llama3.2-vision",
+                                messages=[
+                                    {"role": "user", "content": content,
+                                        "images": [encoded_image]},
+                                ],
+                            )
+
+                            document_type = response["message"]["content"].split(":")[
+                                0].replace("*", "").replace(".", "")
+
+                            # Sanity check in case the model ignores the output instructions (NOTE(review): the break below is unconditional, so an over-long reply is logged but never retried)
+                            if len(document_type) > 16:
+                                self.logger.warning(
+                                    f"Ollama API gave incorrect document category: {response["message"]["content"]}. Retrying...")
                            break
-                    if not document_type:
-                        document_type = "other"
+
+                    # If that fails, fall back to the plain OCR text and use its first non-empty line as the category (quick-and-dirty fallback)
+                    except Exception as e:
+                        self.logger.warning(f"Error! {e}")
+                        self.logger.warning(
+                            "Ollama OCR offloading failed. Falling back to default OCR")
+                        lines = text.split("\n")
+
+                        for line in lines:
+                            if line.strip():
+                                document_type = line.strip().lower()
+                                break
+
+                        if not document_type:
+                            document_type = "other"

                    metadata += text