From b9636dbd573e298b482e3b97c35b191eed03fcdc Mon Sep 17 00:00:00 2001 From: Jan Mrna Date: Thu, 6 Nov 2025 10:45:42 +0100 Subject: [PATCH] Serve by file index, not full path --- db.py | 25 ++++++++++++++++++++++--- main.py | 18 ++++++------------ templates/index.html | 2 +- 3 files changed, 29 insertions(+), 16 deletions(-) diff --git a/db.py b/db.py index 1d50c1e..b7aec77 100644 --- a/db.py +++ b/db.py @@ -26,7 +26,7 @@ class Record: class QueryResult: record: Record distance: float - document: Path + document_name: str @dataclass(slots=True) class Database: @@ -197,7 +197,7 @@ def query(db: Database | Path, text: str, record_count: int = 10) -> list[QueryR # Look up the corresponding record if vector_bytes in db.records: record = db.records[vector_bytes] - results.append(QueryResult(record, distance, db.documents[record.document_index])) + results.append(QueryResult(record, distance, db.documents[record.document_index].name)) return results @@ -272,4 +272,23 @@ def add_document(db: Database | Path, file: Path, max_workers: int = 4) -> None: # Save database if we loaded it from file if save_to_file and database_file_path: save(db, database_file_path) - print(f"Database saved to {database_file_path}") \ No newline at end of file + print(f"Database saved to {database_file_path}") + +def get_document_path(db: Database | Path, document_index: int) -> Path: + """ + Get the file path of the document at the given index in the database. + + Args: + db: Database object or path to database file + document_index: Index of the document to retrieve + + Returns: + Path to the document file + """ + if isinstance(db, Path): + db = load(db) + + if document_index < 0 or document_index >= len(db.documents): + raise IndexError(f"Document index out of range: {document_index}") + + return db.documents[document_index] \ No newline at end of file diff --git a/main.py b/main.py index 30106e5..9365d59 100644 --- a/main.py +++ b/main.py @@ -166,7 +166,7 @@ def query(db_path: str, query_text: str): for i, res in enumerate(results, 1): print(f"\n{i}. Distance: {res.distance:.4f}") - print(f" Document: {res.document.name}") + print(f" Document: {res.document_name}") print(f" Page: {res.record.page}, Chunk: {res.record.chunk}") # Replace all whitespace characters with regular spaces for cleaner display clean_text = ' '.join(res.record.text[:200].split()) @@ -201,18 +201,13 @@ def start_web_server(db_path: str, host: str = "127.0.0.1", port: int = 5000): def index(): return render_template("index.html", results=None) - @app.route('/file/') - def serve_file(document_path): + @app.route('/file/') + def serve_file(document_index): """Serve PDF files directly.""" try: - file_path = Path(document_path) + file_path = db.get_document_path(db_file, document_index) if not file_path.exists(): return jsonify({'error': 'File not found'}), 404 - - # Check if it's a PDF file for security - if file_path.suffix.lower() != '.pdf': - return jsonify({'error': 'Only PDF files are allowed'}), 403 - return send_file(file_path, as_attachment=False) except Exception as e: return jsonify({'error': str(e)}), 500 @@ -236,13 +231,12 @@ def start_web_server(db_path: str, host: str = "127.0.0.1", port: int = 5000): for res in results: formatted_results.append({ 'distance': float(res.distance), - 'document': res.document.name, - 'document_path': str(res.document), # Full path for the link + 'document_name': res.document_name, + 'document_index': res.record.document_index, 'page': res.record.page, 'chunk': res.record.chunk, 'text': ' '.join(res.record.text[:300].split()) # Clean and truncate text }) - return jsonify({'results': formatted_results}) except Exception as e: diff --git a/templates/index.html b/templates/index.html index 8722775..0becddb 100644 --- a/templates/index.html +++ b/templates/index.html @@ -58,7 +58,7 @@ resultsDiv.innerHTML = data.results.map((result, i) => `
- Result ${i + 1} - ${result.document} + Result ${i + 1} - ${result.document_name} (Distance: ${result.distance.toFixed(4)})
Page: ${result.page}, Chunk: ${result.chunk}