Compare commits

...

3 commits

8 changed files with 143 additions and 34 deletions

View file

@ -20,4 +20,11 @@ ADMIN_EMAIL = 'admin@test.com'
ADMIN_PASSWORD = ''
# Whether to insert test data (UNUSED)
TEST_DATA = "True"
TEST_DATA = "True"
# Ollama for Categorization
OLLAMA_URL = "localhost:11434"
OLLAMA_USE_AUTH = "False"
OLLAMA_MODEL = "knoopx/mobile-vlm:3b-fp16"
OLLAMA_USERNAME = ""
OLLAMA_PASSWORD = ""

View file

@ -8,7 +8,8 @@ class IsStaff(BasePermission):
def has_permission(self, request, view):
return bool(
request.user and request.user.role in ("head", "admin", "planning", "staff")
request.user and request.user.role in (
"head", "admin", "planning", "staff")
)
@ -18,7 +19,7 @@ class IsPlanning(BasePermission):
"""
def has_permission(self, request, view):
return bool(request.user and request.user.role in ("head", "admin", "planning"))
return bool(request.user and request.user.role == "planning")
class IsHead(BasePermission):

View file

@ -6,7 +6,7 @@ from rest_framework.settings import api_settings
class CustomUserSerializer(serializers.ModelSerializer):
birthday = serializers.DateField(format="%m-%d-%Y")
birthday = serializers.DateField(format="%Y-%m-%d")
class Meta:
model = CustomUser

View file

@ -10,19 +10,48 @@ from .models import CustomUser
def create_admin_user(sender, **kwargs):
# Programmatically creates the administrator account
if sender.name == "accounts":
ADMIN_USER = CustomUser.objects.filter(
email=get_secret("ADMIN_EMAIL")).first()
if not ADMIN_USER:
ADMIN_USER = CustomUser.objects.create_superuser(
username=get_secret("ADMIN_EMAIL"),
email=get_secret("ADMIN_EMAIL"),
password=get_secret("ADMIN_PASSWORD"),
sex="male",
birthday=localdate(now()),
)
users = [{
"email": get_secret("ADMIN_EMAIL"),
"role": "head",
"admin": True,
}, {
"email": "staff@test.com",
"role": "staff",
"admin": False,
}, {
"email": "planning@test.com",
"role": "planning",
"admin": False,
}, {
"email": "client@test.com",
"role": "client",
"admin": False,
},]
for user in users:
USER = CustomUser.objects.filter(
email=user["email"]).first()
if not USER:
if user["admin"]:
USER = CustomUser.objects.create_superuser(
username=user["email"],
email=user["email"],
password=get_secret("ADMIN_PASSWORD"),
sex="male",
birthday=localdate(now()),
role=user["role"]
)
else:
USER = CustomUser.objects.create_user(
username=user["email"],
email=user["email"],
password=get_secret("ADMIN_PASSWORD"),
sex="male",
birthday=localdate(now()),
role=user["role"]
print("Created administrator account:", ADMIN_USER.email)
)
print(f"Created {user['role']} account: {USER.email}")
ADMIN_USER.first_name = "Administrator"
ADMIN_USER.is_active = True
ADMIN_USER.save()
USER.first_name = f"DEBUG_USER:{USER.email}"
USER.is_active = True
USER.save()

View file

@ -1,4 +1,7 @@
from django.core.management.base import BaseCommand, CommandError
from ollama import ChatResponse
import base64
import httpx
from django.core.management.base import BaseCommand
from io import BytesIO
from documents.models import Document
@ -10,9 +13,11 @@ from config.settings import MEDIA_ROOT
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from documents.models import Document
from config.settings import get_secret
from django.core.files import File
import logging
import time
from ollama import Client
class PDFHandler(FileSystemEventHandler):
@ -36,8 +41,26 @@ class PDFHandler(FileSystemEventHandler):
def process_pdf(self, file_path):
try:
filename = os.path.basename(file_path)
filename = str(filename).replace(" ", "")
# Get the original filename and directory
original_filename = os.path.basename(file_path)
original_dir = os.path.dirname(file_path)
# Check if the filename contains spaces
if " " in original_filename:
# Create the new filename by replacing spaces
new_filename = original_filename.replace(" ", "_")
# Construct the new full file path
new_file_path = os.path.join(original_dir, new_filename)
# Rename the file
os.rename(file_path, new_file_path)
# Update the filename and file_path variables
filename = new_filename
file_path = new_file_path
else:
filename = original_filename
metadata = ""
document_type = ""
@ -60,20 +83,70 @@ class PDFHandler(FileSystemEventHandler):
# Perform OCR
text = pytesseract.image_to_string(img).strip()
lines = text.split("\n")
# Get document category
# Try to pass image to the Ollama image recognition API first
try:
client = Client(
host=get_secret("OLLAMA_URL"),
auth=httpx.BasicAuth(
username=get_secret("OLLAMA_USERNAME"), password=get_secret("OLLAMA_PASSWORD")) if get_secret("OLLAMA_USE_AUTH") else None
)
for line in lines:
if line.strip():
document_type = line.strip().lower()
encoded_image = base64.b64encode(
img_buffer.getvalue()).decode()
attempts = 0
while True:
if attempts >= 3:
raise Exception(
"Unable to categorize using Ollama API")
attempts += 1
content = f"""
Read the text from the image and provide a category.
Possible categories are: Announcement, Manual, Form
Respond only with the category. No explanations are necessary.
"""
response: ChatResponse = client.chat(
model=get_secret("OLLAMA_MODEL"),
messages=[
{"role": "user", "content": content,
"images": [encoded_image]},
],
)
document_type = response["message"]["content"].replace(
"*", "").replace(".", "")
# A few safety checks in case the model does not follow the output instructions
if len(document_type) > 16:
self.logger.warning(
f"Ollama API gave incorrect document category: {response['message']['content']}. Retrying...")
break
if not document_type:
document_type = "other"
# If that fails, just use regular OCR to read the title as a dirty fix/fallback
except Exception as e:
self.logger.warning(f"Error! {e}")
self.logger.warning(
"Ollama OCR offloading failed. Falling back to default OCR")
lines = text.split("\n")
for line in lines:
if line.strip():
document_type = line.strip().lower()
break
if not document_type:
document_type = "other"
metadata += text
# Open the file for instance creation
DOCUMENT, created = Document.objects.get_or_create(
name=filename,
name=filename.replace(".pdf", ""),
defaults={
"number_pages": num_pages,
"ocr_metadata": metadata,
@ -85,8 +158,7 @@ class PDFHandler(FileSystemEventHandler):
DOCUMENT.file.save(
name=filename, content=File(open(file_path, "rb")))
self.logger.info(
f"Document '{filename}' created successfully with type '{
document_type}'."
f"Document '{filename}' created successfully with type '{document_type}'."
)
else:

View file

@ -8,4 +8,4 @@ from .models import Document
class DocumentAdmin(ModelAdmin):
model = Document
search_fields = ["id", "name", "document_type"]
list_display = ["id", "name", "document_type"]
list_display = ["id", "name", "document_type", "date_uploaded"]

View file

@ -51,7 +51,7 @@ class DocumentListView(generics.ListAPIView):
http_method_names = ["get"]
serializer_class = DocumentSerializer
queryset = Document.objects.all()
queryset = Document.objects.all().order_by("-date_uploaded")
pagination_class = PageNumberPagination
permission_classes = [IsAuthenticated]

View file

@ -10,7 +10,7 @@
{% trans 'Please visit the site to check your request:' %}
{{ url|safe }}
{% trans 'For hardcopy requests, please proceed to the USTP office to avail of your requested copies:' %}
{% trans 'For hardcopy requests, please proceed to the USTP office to avail of your requested copies. Hardcopy requests are valid only within 1 month of requesting.' %}
{% endblock %}
{% block html_body %}
@ -27,6 +27,6 @@
</p>
<p>
{% trans 'For hardcopy requests, please proceed to the USTP office to avail of your requested copies:' %}
{% trans 'For hardcopy requests, please proceed to the USTP office to avail of your requested copies. Hardcopy requests are valid only within 1 month of requesting.' %}
</p>
{% endblock %}