mirror of
https://github.com/lemeow125/DocManagerBackend.git
synced 2025-04-20 08:51:31 +08:00
Compare commits
No commits in common. "844113d44f9f43dd6e5a2b891ca1dc6dde982836" and "9289166c0e272eade8dda8a3541c389adb4261f5" have entirely different histories.
844113d44f
...
9289166c0e
8 changed files with 34 additions and 143 deletions
|
@ -21,10 +21,3 @@ ADMIN_PASSWORD = ''
|
||||||
|
|
||||||
# To insert test data or not (UNUSED)
|
# To insert test data or not (UNUSED)
|
||||||
TEST_DATA = "True"
|
TEST_DATA = "True"
|
||||||
|
|
||||||
# Ollama for Categorization
|
|
||||||
OLLAMA_URL = "localhost:11434"
|
|
||||||
OLLAMA_USE_AUTH = "False"
|
|
||||||
OLLAMA_MODEL = "knoopx/mobile-vlm:3b-fp16"
|
|
||||||
OLLAMA_USERNAME = ""
|
|
||||||
OLLAMA_PASSWORD = ""
|
|
|
@ -8,8 +8,7 @@ class IsStaff(BasePermission):
|
||||||
|
|
||||||
def has_permission(self, request, view):
|
def has_permission(self, request, view):
|
||||||
return bool(
|
return bool(
|
||||||
request.user and request.user.role in (
|
request.user and request.user.role in ("head", "admin", "planning", "staff")
|
||||||
"head", "admin", "planning", "staff")
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -19,7 +18,7 @@ class IsPlanning(BasePermission):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def has_permission(self, request, view):
|
def has_permission(self, request, view):
|
||||||
return bool(request.user and request.user.role == "planning")
|
return bool(request.user and request.user.role in ("head", "admin", "planning"))
|
||||||
|
|
||||||
|
|
||||||
class IsHead(BasePermission):
|
class IsHead(BasePermission):
|
||||||
|
|
|
@ -6,7 +6,7 @@ from rest_framework.settings import api_settings
|
||||||
|
|
||||||
|
|
||||||
class CustomUserSerializer(serializers.ModelSerializer):
|
class CustomUserSerializer(serializers.ModelSerializer):
|
||||||
birthday = serializers.DateField(format="%Y-%m-%d")
|
birthday = serializers.DateField(format="%m-%d-%Y")
|
||||||
|
|
||||||
class Meta:
|
class Meta:
|
||||||
model = CustomUser
|
model = CustomUser
|
||||||
|
|
|
@ -10,48 +10,19 @@ from .models import CustomUser
|
||||||
def create_admin_user(sender, **kwargs):
|
def create_admin_user(sender, **kwargs):
|
||||||
# Programatically creates the administrator account
|
# Programatically creates the administrator account
|
||||||
if sender.name == "accounts":
|
if sender.name == "accounts":
|
||||||
users = [{
|
ADMIN_USER = CustomUser.objects.filter(
|
||||||
"email": get_secret("ADMIN_EMAIL"),
|
email=get_secret("ADMIN_EMAIL")).first()
|
||||||
"role": "head",
|
if not ADMIN_USER:
|
||||||
"admin": True,
|
ADMIN_USER = CustomUser.objects.create_superuser(
|
||||||
}, {
|
username=get_secret("ADMIN_EMAIL"),
|
||||||
"email": "staff@test.com",
|
email=get_secret("ADMIN_EMAIL"),
|
||||||
"role": "staff",
|
|
||||||
"admin": False,
|
|
||||||
}, {
|
|
||||||
"email": "planning@test.com",
|
|
||||||
"role": "planning",
|
|
||||||
"admin": False,
|
|
||||||
}, {
|
|
||||||
"email": "client@test.com",
|
|
||||||
"role": "client",
|
|
||||||
"admin": False,
|
|
||||||
},]
|
|
||||||
for user in users:
|
|
||||||
USER = CustomUser.objects.filter(
|
|
||||||
email=user["email"]).first()
|
|
||||||
if not USER:
|
|
||||||
if user["admin"]:
|
|
||||||
USER = CustomUser.objects.create_superuser(
|
|
||||||
username=user["email"],
|
|
||||||
email=user["email"],
|
|
||||||
password=get_secret("ADMIN_PASSWORD"),
|
password=get_secret("ADMIN_PASSWORD"),
|
||||||
sex="male",
|
sex="male",
|
||||||
birthday=localdate(now()),
|
birthday=localdate(now()),
|
||||||
role=user["role"]
|
|
||||||
)
|
)
|
||||||
else:
|
|
||||||
USER = CustomUser.objects.create_user(
|
|
||||||
username=user["email"],
|
|
||||||
email=user["email"],
|
|
||||||
password=get_secret("ADMIN_PASSWORD"),
|
|
||||||
sex="male",
|
|
||||||
birthday=localdate(now()),
|
|
||||||
role=user["role"]
|
|
||||||
|
|
||||||
)
|
print("Created administrator account:", ADMIN_USER.email)
|
||||||
print(f"Created {user['role']} account: {USER.email}")
|
|
||||||
|
|
||||||
USER.first_name = f"DEBUG_USER:{USER.email}"
|
ADMIN_USER.first_name = "Administrator"
|
||||||
USER.is_active = True
|
ADMIN_USER.is_active = True
|
||||||
USER.save()
|
ADMIN_USER.save()
|
||||||
|
|
|
@ -1,7 +1,4 @@
|
||||||
from ollama import ChatResponse
|
from django.core.management.base import BaseCommand, CommandError
|
||||||
import base64
|
|
||||||
import httpx
|
|
||||||
from django.core.management.base import BaseCommand
|
|
||||||
|
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from documents.models import Document
|
from documents.models import Document
|
||||||
|
@ -13,11 +10,9 @@ from config.settings import MEDIA_ROOT
|
||||||
from watchdog.observers import Observer
|
from watchdog.observers import Observer
|
||||||
from watchdog.events import FileSystemEventHandler
|
from watchdog.events import FileSystemEventHandler
|
||||||
from documents.models import Document
|
from documents.models import Document
|
||||||
from config.settings import get_secret
|
|
||||||
from django.core.files import File
|
from django.core.files import File
|
||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
from ollama import Client
|
|
||||||
|
|
||||||
|
|
||||||
class PDFHandler(FileSystemEventHandler):
|
class PDFHandler(FileSystemEventHandler):
|
||||||
|
@ -41,26 +36,8 @@ class PDFHandler(FileSystemEventHandler):
|
||||||
|
|
||||||
def process_pdf(self, file_path):
|
def process_pdf(self, file_path):
|
||||||
try:
|
try:
|
||||||
# Get the original filename and directory
|
filename = os.path.basename(file_path)
|
||||||
original_filename = os.path.basename(file_path)
|
filename = str(filename).replace(" ", "")
|
||||||
original_dir = os.path.dirname(file_path)
|
|
||||||
|
|
||||||
# Check if the filename contains spaces
|
|
||||||
if " " in original_filename:
|
|
||||||
# Create the new filename by replacing spaces
|
|
||||||
new_filename = original_filename.replace(" ", "_")
|
|
||||||
|
|
||||||
# Construct the new full file path
|
|
||||||
new_file_path = os.path.join(original_dir, new_filename)
|
|
||||||
|
|
||||||
# Rename the file
|
|
||||||
os.rename(file_path, new_file_path)
|
|
||||||
|
|
||||||
# Update the filename and file_path variables
|
|
||||||
filename = new_filename
|
|
||||||
file_path = new_file_path
|
|
||||||
else:
|
|
||||||
filename = original_filename
|
|
||||||
metadata = ""
|
metadata = ""
|
||||||
document_type = ""
|
document_type = ""
|
||||||
|
|
||||||
|
@ -83,62 +60,12 @@ class PDFHandler(FileSystemEventHandler):
|
||||||
# Perform OCR
|
# Perform OCR
|
||||||
text = pytesseract.image_to_string(img).strip()
|
text = pytesseract.image_to_string(img).strip()
|
||||||
|
|
||||||
# Get document category
|
|
||||||
# Try to pass image to the Ollama image recognition API first
|
|
||||||
try:
|
|
||||||
client = Client(
|
|
||||||
host=get_secret("OLLAMA_URL"),
|
|
||||||
auth=httpx.BasicAuth(
|
|
||||||
username=get_secret("OLLAMA_USERNAME"), password=get_secret("OLLAMA_PASSWORD")) if get_secret("OLLAMA_USE_AUTH") else None
|
|
||||||
)
|
|
||||||
|
|
||||||
encoded_image = base64.b64encode(
|
|
||||||
img_buffer.getvalue()).decode()
|
|
||||||
|
|
||||||
attempts = 0
|
|
||||||
while True:
|
|
||||||
if attempts >= 3:
|
|
||||||
raise Exception(
|
|
||||||
"Unable to categorize using Ollama API")
|
|
||||||
attempts += 1
|
|
||||||
|
|
||||||
content = f"""
|
|
||||||
Read the text from the image and provide a category.
|
|
||||||
|
|
||||||
Possible categories are: Announcement, Manual, Form
|
|
||||||
|
|
||||||
Respond only with the category. No explanations are necessary.
|
|
||||||
"""
|
|
||||||
|
|
||||||
response: ChatResponse = client.chat(
|
|
||||||
model=get_secret("OLLAMA_MODEL"),
|
|
||||||
messages=[
|
|
||||||
{"role": "user", "content": content,
|
|
||||||
"images": [encoded_image]},
|
|
||||||
],
|
|
||||||
)
|
|
||||||
|
|
||||||
document_type = response["message"]["content"].replace(
|
|
||||||
"*", "").replace(".", "")
|
|
||||||
|
|
||||||
# A few safety checks if the model does not follow through with output instructions
|
|
||||||
if len(document_type) > 16:
|
|
||||||
self.logger.warning(
|
|
||||||
f"Ollama API gave incorrect document category: {response['message']['content']}. Retrying...")
|
|
||||||
break
|
|
||||||
|
|
||||||
# If that fails, just use regular OCR read the title as a dirty fix/fallback
|
|
||||||
except Exception as e:
|
|
||||||
self.logger.warning(f"Error! {e}")
|
|
||||||
self.logger.warning(
|
|
||||||
"Ollama OCR offloading failed. Falling back to default OCR")
|
|
||||||
lines = text.split("\n")
|
lines = text.split("\n")
|
||||||
|
|
||||||
for line in lines:
|
for line in lines:
|
||||||
if line.strip():
|
if line.strip():
|
||||||
document_type = line.strip().lower()
|
document_type = line.strip().lower()
|
||||||
break
|
break
|
||||||
|
|
||||||
if not document_type:
|
if not document_type:
|
||||||
document_type = "other"
|
document_type = "other"
|
||||||
|
|
||||||
|
@ -146,7 +73,7 @@ class PDFHandler(FileSystemEventHandler):
|
||||||
|
|
||||||
# Open the file for instance creation
|
# Open the file for instance creation
|
||||||
DOCUMENT, created = Document.objects.get_or_create(
|
DOCUMENT, created = Document.objects.get_or_create(
|
||||||
name=filename.replace(".pdf", ""),
|
name=filename,
|
||||||
defaults={
|
defaults={
|
||||||
"number_pages": num_pages,
|
"number_pages": num_pages,
|
||||||
"ocr_metadata": metadata,
|
"ocr_metadata": metadata,
|
||||||
|
@ -158,7 +85,8 @@ class PDFHandler(FileSystemEventHandler):
|
||||||
DOCUMENT.file.save(
|
DOCUMENT.file.save(
|
||||||
name=filename, content=File(open(file_path, "rb")))
|
name=filename, content=File(open(file_path, "rb")))
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
f"Document '{filename}' created successfully with type '{document_type}'."
|
f"Document '{filename}' created successfully with type '{
|
||||||
|
document_type}'."
|
||||||
)
|
)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -8,4 +8,4 @@ from .models import Document
|
||||||
class DocumentAdmin(ModelAdmin):
|
class DocumentAdmin(ModelAdmin):
|
||||||
model = Document
|
model = Document
|
||||||
search_fields = ["id", "name", "document_type"]
|
search_fields = ["id", "name", "document_type"]
|
||||||
list_display = ["id", "name", "document_type", "date_uploaded"]
|
list_display = ["id", "name", "document_type"]
|
||||||
|
|
|
@ -51,7 +51,7 @@ class DocumentListView(generics.ListAPIView):
|
||||||
|
|
||||||
http_method_names = ["get"]
|
http_method_names = ["get"]
|
||||||
serializer_class = DocumentSerializer
|
serializer_class = DocumentSerializer
|
||||||
queryset = Document.objects.all().order_by("-date_uploaded")
|
queryset = Document.objects.all()
|
||||||
pagination_class = PageNumberPagination
|
pagination_class = PageNumberPagination
|
||||||
permission_classes = [IsAuthenticated]
|
permission_classes = [IsAuthenticated]
|
||||||
|
|
||||||
|
|
|
@ -10,7 +10,7 @@
|
||||||
{% trans 'Please visit the site to check your request:' %}
|
{% trans 'Please visit the site to check your request:' %}
|
||||||
{{ url|safe }}
|
{{ url|safe }}
|
||||||
|
|
||||||
{% trans 'For hardcopy requests, please proceed to the USTP office to avail of your requested copies. Hardcopy requests are valid only within 1 month of requesting.' %}
|
{% trans 'For hardcopy requests, please proceed to the USTP office to avail of your requested copies:' %}
|
||||||
{% endblock %}
|
{% endblock %}
|
||||||
|
|
||||||
{% block html_body %}
|
{% block html_body %}
|
||||||
|
@ -27,6 +27,6 @@
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
<p>
|
<p>
|
||||||
{% trans 'For hardcopy requests, please proceed to the USTP office to avail of your requested copies. Hardcopy requests are valid only within 1 month of requesting.' %}
|
{% trans 'For hardcopy requests, please proceed to the USTP office to avail of your requested copies:' %}
|
||||||
</p>
|
</p>
|
||||||
{% endblock %}
|
{% endblock %}
|
||||||
|
|
Loading…
Add table
Reference in a new issue