Compare commits

..

3 commits

8 changed files with 143 additions and 34 deletions

View file

@ -21,3 +21,10 @@ ADMIN_PASSWORD = ''
# To insert test data or not (UNUSED) # To insert test data or not (UNUSED)
TEST_DATA = "True" TEST_DATA = "True"
# Ollama for Categorization
OLLAMA_URL = "localhost:11434"
OLLAMA_USE_AUTH = "False"
OLLAMA_MODEL = "knoopx/mobile-vlm:3b-fp16"
OLLAMA_USERNAME = ""
OLLAMA_PASSWORD = ""

View file

@ -8,7 +8,8 @@ class IsStaff(BasePermission):
def has_permission(self, request, view): def has_permission(self, request, view):
return bool( return bool(
request.user and request.user.role in ("head", "admin", "planning", "staff") request.user and request.user.role in (
"head", "admin", "planning", "staff")
) )
@ -18,7 +19,7 @@ class IsPlanning(BasePermission):
""" """
def has_permission(self, request, view): def has_permission(self, request, view):
return bool(request.user and request.user.role in ("head", "admin", "planning")) return bool(request.user and request.user.role == "planning")
class IsHead(BasePermission): class IsHead(BasePermission):

View file

@ -6,7 +6,7 @@ from rest_framework.settings import api_settings
class CustomUserSerializer(serializers.ModelSerializer): class CustomUserSerializer(serializers.ModelSerializer):
birthday = serializers.DateField(format="%m-%d-%Y") birthday = serializers.DateField(format="%Y-%m-%d")
class Meta: class Meta:
model = CustomUser model = CustomUser

View file

@ -10,19 +10,48 @@ from .models import CustomUser
def create_admin_user(sender, **kwargs): def create_admin_user(sender, **kwargs):
# Programmatically creates the administrator account # Programmatically creates the administrator account
if sender.name == "accounts": if sender.name == "accounts":
ADMIN_USER = CustomUser.objects.filter( users = [{
email=get_secret("ADMIN_EMAIL")).first() "email": get_secret("ADMIN_EMAIL"),
if not ADMIN_USER: "role": "head",
ADMIN_USER = CustomUser.objects.create_superuser( "admin": True,
username=get_secret("ADMIN_EMAIL"), }, {
email=get_secret("ADMIN_EMAIL"), "email": "staff@test.com",
"role": "staff",
"admin": False,
}, {
"email": "planning@test.com",
"role": "planning",
"admin": False,
}, {
"email": "client@test.com",
"role": "client",
"admin": False,
},]
for user in users:
USER = CustomUser.objects.filter(
email=user["email"]).first()
if not USER:
if user["admin"]:
USER = CustomUser.objects.create_superuser(
username=user["email"],
email=user["email"],
password=get_secret("ADMIN_PASSWORD"), password=get_secret("ADMIN_PASSWORD"),
sex="male", sex="male",
birthday=localdate(now()), birthday=localdate(now()),
role=user["role"]
) )
else:
USER = CustomUser.objects.create_user(
username=user["email"],
email=user["email"],
password=get_secret("ADMIN_PASSWORD"),
sex="male",
birthday=localdate(now()),
role=user["role"]
print("Created administrator account:", ADMIN_USER.email) )
print(f"Created {user['role']} account: {USER.email}")
ADMIN_USER.first_name = "Administrator" USER.first_name = f"DEBUG_USER:{USER.email}"
ADMIN_USER.is_active = True USER.is_active = True
ADMIN_USER.save() USER.save()

View file

@ -1,4 +1,7 @@
from django.core.management.base import BaseCommand, CommandError from ollama import ChatResponse
import base64
import httpx
from django.core.management.base import BaseCommand
from io import BytesIO from io import BytesIO
from documents.models import Document from documents.models import Document
@ -10,9 +13,11 @@ from config.settings import MEDIA_ROOT
from watchdog.observers import Observer from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler from watchdog.events import FileSystemEventHandler
from documents.models import Document from documents.models import Document
from config.settings import get_secret
from django.core.files import File from django.core.files import File
import logging import logging
import time import time
from ollama import Client
class PDFHandler(FileSystemEventHandler): class PDFHandler(FileSystemEventHandler):
@ -36,8 +41,26 @@ class PDFHandler(FileSystemEventHandler):
def process_pdf(self, file_path): def process_pdf(self, file_path):
try: try:
filename = os.path.basename(file_path) # Get the original filename and directory
filename = str(filename).replace(" ", "") original_filename = os.path.basename(file_path)
original_dir = os.path.dirname(file_path)
# Check if the filename contains spaces
if " " in original_filename:
# Create the new filename by replacing spaces
new_filename = original_filename.replace(" ", "_")
# Construct the new full file path
new_file_path = os.path.join(original_dir, new_filename)
# Rename the file
os.rename(file_path, new_file_path)
# Update the filename and file_path variables
filename = new_filename
file_path = new_file_path
else:
filename = original_filename
metadata = "" metadata = ""
document_type = "" document_type = ""
@ -60,12 +83,62 @@ class PDFHandler(FileSystemEventHandler):
# Perform OCR # Perform OCR
text = pytesseract.image_to_string(img).strip() text = pytesseract.image_to_string(img).strip()
# Get document category
# Try to pass image to the Ollama image recognition API first
try:
client = Client(
host=get_secret("OLLAMA_URL"),
auth=httpx.BasicAuth(
username=get_secret("OLLAMA_USERNAME"), password=get_secret("OLLAMA_PASSWORD")) if get_secret("OLLAMA_USE_AUTH") else None
)
encoded_image = base64.b64encode(
img_buffer.getvalue()).decode()
attempts = 0
while True:
if attempts >= 3:
raise Exception(
"Unable to categorize using Ollama API")
attempts += 1
content = f"""
Read the text from the image and provide a category.
Possible categories are: Announcement, Manual, Form
Respond only with the category. No explanations are necessary.
"""
response: ChatResponse = client.chat(
model=get_secret("OLLAMA_MODEL"),
messages=[
{"role": "user", "content": content,
"images": [encoded_image]},
],
)
document_type = response["message"]["content"].replace(
"*", "").replace(".", "")
# A few safety checks if the model does not follow through with output instructions
if len(document_type) > 16:
self.logger.warning(
f"Ollama API gave incorrect document category: {response['message']['content']}. Retrying...")
break
# If that fails, just use regular OCR to read the title as a dirty fix/fallback
except Exception as e:
self.logger.warning(f"Error! {e}")
self.logger.warning(
"Ollama OCR offloading failed. Falling back to default OCR")
lines = text.split("\n") lines = text.split("\n")
for line in lines: for line in lines:
if line.strip(): if line.strip():
document_type = line.strip().lower() document_type = line.strip().lower()
break break
if not document_type: if not document_type:
document_type = "other" document_type = "other"
@ -73,7 +146,7 @@ class PDFHandler(FileSystemEventHandler):
# Open the file for instance creation # Open the file for instance creation
DOCUMENT, created = Document.objects.get_or_create( DOCUMENT, created = Document.objects.get_or_create(
name=filename, name=filename.replace(".pdf", ""),
defaults={ defaults={
"number_pages": num_pages, "number_pages": num_pages,
"ocr_metadata": metadata, "ocr_metadata": metadata,
@ -85,8 +158,7 @@ class PDFHandler(FileSystemEventHandler):
DOCUMENT.file.save( DOCUMENT.file.save(
name=filename, content=File(open(file_path, "rb"))) name=filename, content=File(open(file_path, "rb")))
self.logger.info( self.logger.info(
f"Document '{filename}' created successfully with type '{ f"Document '{filename}' created successfully with type '{document_type}'."
document_type}'."
) )
else: else:

View file

@ -8,4 +8,4 @@ from .models import Document
class DocumentAdmin(ModelAdmin): class DocumentAdmin(ModelAdmin):
model = Document model = Document
search_fields = ["id", "name", "document_type"] search_fields = ["id", "name", "document_type"]
list_display = ["id", "name", "document_type"] list_display = ["id", "name", "document_type", "date_uploaded"]

View file

@ -51,7 +51,7 @@ class DocumentListView(generics.ListAPIView):
http_method_names = ["get"] http_method_names = ["get"]
serializer_class = DocumentSerializer serializer_class = DocumentSerializer
queryset = Document.objects.all() queryset = Document.objects.all().order_by("-date_uploaded")
pagination_class = PageNumberPagination pagination_class = PageNumberPagination
permission_classes = [IsAuthenticated] permission_classes = [IsAuthenticated]

View file

@ -10,7 +10,7 @@
{% trans 'Please visit the site to check your request:' %} {% trans 'Please visit the site to check your request:' %}
{{ url|safe }} {{ url|safe }}
{% trans 'For hardcopy requests, please proceed to the USTP office to avail of your requested copies:' %} {% trans 'For hardcopy requests, please proceed to the USTP office to avail of your requested copies. Hardcopy requests are valid only within 1 month of requesting.' %}
{% endblock %} {% endblock %}
{% block html_body %} {% block html_body %}
@ -27,6 +27,6 @@
</p> </p>
<p> <p>
{% trans 'For hardcopy requests, please proceed to the USTP office to avail of your requested copies:' %} {% trans 'For hardcopy requests, please proceed to the USTP office to avail of your requested copies. Hardcopy requests are valid only within 1 month of requesting.' %}
</p> </p>
{% endblock %} {% endblock %}