Fixed warnings from pylint

Jan Mrna
2025-11-06 10:58:18 +01:00
parent 7010edae44
commit ee8a8ad170
2 changed files with 45 additions and 22 deletions

db.py

@@ -1,3 +1,7 @@
#pylint: disable=missing-class-docstring,invalid-name,broad-exception-caught
"""
Database module for semantic document search tool.
"""
import pickle
from pathlib import Path
from dataclasses import dataclass
@@ -6,13 +10,13 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
import numpy as np
import pymupdf
import ollama # TODO split to another file
import ollama
#
# Types
#
type Vector = np.NDArray # np.NDArray[np.float32] ?
type Vector = np.NDArray
type VectorBytes = bytes
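
(Aside, not part of the commit: numpy has no top-level np.NDArray; the array alias usually comes from numpy.typing. A minimal sketch of how the Vector alias could be written, assuming float32 embeddings as the removed comment hints:)

# Sketch only; uses the same Python 3.12+ type-alias syntax as the module.
import numpy as np
import numpy.typing as npt

type Vector = npt.NDArray[np.float32]
type VectorBytes = bytes
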
@@ -90,12 +94,29 @@ def _vectorize_record(record: Record) -> tuple[Record, Vector]:
return record, _embed(record.text)
def test_embedding() -> bool:
"""
Test if embedding functionality is available and working.
Returns:
bool: True if embedding is working, False otherwise
"""
try:
_ = _embed("Test.")
return True
except Exception:
return False
#
# High-level (exported) functions
#
def create_dummy() -> Database:
"""
Create a dummy database for testing purposes.
"""
db_length: Final[int] = 10
vectors = [np.array([i, 2 * i, 3 * i, 4 * i]) for i in range(db_length)]
records = {
@@ -250,13 +271,11 @@ def add_document(db: Database | Path, file: Path, max_workers: int = 4) -> None:
records: list[Record] = []
chunk_size = 1024
for page_num in range(len(doc)):
page = doc[page_num]
for page_num, page in enumerate(doc):
text = page.get_text().strip()
if not text:
print(f" Page {page_num + 1}: Skipped (empty)")
continue
# Simple chunking - split text into chunks of specified size
for chunk_idx, i in enumerate(range(0, len(text), chunk_size)):
chunk = text[i : i + chunk_size]
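
(Aside: the loop above does plain fixed-size chunking of the page text. A standalone sketch of the same idea; the function name and example call are illustrative, not from the commit:)

def split_into_chunks(text: str, chunk_size: int = 1024) -> list[str]:
    # Split text into consecutive chunks of at most chunk_size characters.
    return [text[i : i + chunk_size] for i in range(0, len(text), chunk_size)]

# split_into_chunks("a" * 2500) -> three chunks of 1024, 1024 and 452 characters
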
@@ -267,14 +286,14 @@ def add_document(db: Database | Path, file: Path, max_workers: int = 4) -> None:
)
doc.close()
except Exception as e:
raise RuntimeError(f"Error processing PDF {file}: {e}")
raise RuntimeError(f"Error processing PDF {file}: {e}") from e
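
(Aside: the raise ... from e change addresses pylint's raise-missing-from warning; chaining keeps the original exception attached as __cause__ in the traceback. A tiny standalone illustration, not taken from the module:)

try:
    int("not a number")
except ValueError as e:
    # The chained traceback shows the ValueError first, followed by
    # "The above exception was the direct cause of the following exception:".
    raise RuntimeError("Could not parse input") from e
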
# Process chunks in parallel
print(f"Processing {len(records)} chunks with {max_workers} workers...")
db.documents.append(file)
# TODO measure with GIL disabled to check if multithreading actually helps
# NOTE this will only help with GIL disabled
with ThreadPoolExecutor(max_workers=max_workers) as pool:
futures = [pool.submit(_vectorize_record, r) for r in records]
for f in as_completed(futures):
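
(Aside: the block above is the standard submit-then-collect pattern from concurrent.futures; results come back in completion order via as_completed. A self-contained sketch of that pattern, with a stand-in for the real embedding call:)

from concurrent.futures import ThreadPoolExecutor, as_completed

def embed_stub(text: str) -> list[float]:
    # Stand-in for the module's _embed (e.g. a request to an ollama server).
    return [float(len(text))]

texts = ["first chunk", "second chunk", "third chunk"]
with ThreadPoolExecutor(max_workers=4) as pool:
    futures = {pool.submit(embed_stub, t): t for t in texts}
    for future in as_completed(futures):
        print(f"{futures[future]!r} -> {future.result()}")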