Serve by file index, not full path

This commit is contained in:
Jan Mrna
2025-11-06 10:45:42 +01:00
parent e624db1ce7
commit 788eebc916
3 changed files with 29 additions and 16 deletions

23
db.py
View File

@@ -26,7 +26,7 @@ class Record:
class QueryResult:
record: Record
distance: float
document: Path
document_name: str
@dataclass(slots=True)
class Database:
@@ -197,7 +197,7 @@ def query(db: Database | Path, text: str, record_count: int = 10) -> list[QueryR
# Look up the corresponding record
if vector_bytes in db.records:
record = db.records[vector_bytes]
results.append(QueryResult(record, distance, db.documents[record.document_index]))
results.append(QueryResult(record, distance, db.documents[record.document_index].name))
return results
@@ -273,3 +273,22 @@ def add_document(db: Database | Path, file: Path, max_workers: int = 4) -> None:
if save_to_file and database_file_path:
save(db, database_file_path)
print(f"Database saved to {database_file_path}")
def get_document_path(db: Database | Path, document_index: int) -> Path:
    """
    Look up the stored file path for a document by its position in the database.

    Args:
        db: An already-loaded Database, or a Path to a database file. A Path
            is passed through ``load`` first.
        document_index: Zero-based position in ``db.documents``.

    Returns:
        The entry of ``db.documents`` at ``document_index``.

    Raises:
        IndexError: If ``document_index`` is negative or not smaller than
            the number of stored documents.
    """
    database = load(db) if isinstance(db, Path) else db
    documents = database.documents

    # Negative indices are rejected explicitly rather than being allowed to
    # wrap around to the end of the list.
    if 0 <= document_index < len(documents):
        return documents[document_index]

    raise IndexError(f"Document index out of range: {document_index}")

18
main.py
View File

@@ -166,7 +166,7 @@ def query(db_path: str, query_text: str):
for i, res in enumerate(results, 1):
print(f"\n{i}. Distance: {res.distance:.4f}")
print(f" Document: {res.document.name}")
print(f" Document: {res.document_name}")
print(f" Page: {res.record.page}, Chunk: {res.record.chunk}")
# Replace all whitespace characters with regular spaces for cleaner display
clean_text = ' '.join(res.record.text[:200].split())
@@ -201,18 +201,13 @@ def start_web_server(db_path: str, host: str = "127.0.0.1", port: int = 5000):
def index():
return render_template("index.html", results=None)
@app.route('/file/<path:document_path>')
def serve_file(document_path):
@app.route('/file/<int:document_index>')
def serve_file(document_index):
"""Serve PDF files directly."""
try:
file_path = Path(document_path)
file_path = db.get_document_path(db_file, document_index)
if not file_path.exists():
return jsonify({'error': 'File not found'}), 404
# Check if it's a PDF file for security
if file_path.suffix.lower() != '.pdf':
return jsonify({'error': 'Only PDF files are allowed'}), 403
return send_file(file_path, as_attachment=False)
except Exception as e:
return jsonify({'error': str(e)}), 500
@@ -236,13 +231,12 @@ def start_web_server(db_path: str, host: str = "127.0.0.1", port: int = 5000):
for res in results:
formatted_results.append({
'distance': float(res.distance),
'document': res.document.name,
'document_path': str(res.document), # Full path for the link
'document_name': res.document_name,
'document_index': res.record.document_index,
'page': res.record.page,
'chunk': res.record.chunk,
'text': ' '.join(res.record.text[:300].split()) # Clean and truncate text
})
return jsonify({'results': formatted_results})
except Exception as e:

View File

@@ -58,7 +58,7 @@
resultsDiv.innerHTML = data.results.map((result, i) => `
<div class="result">
<div class="result-header">
Result ${i + 1} - <a href="/file/${encodeURIComponent(result.document_path)}#page=${result.page}" class="document-link" target="_blank">${result.document}</a>
Result ${i + 1} - <a href="/file/${encodeURIComponent(result.document_index)}#page=${result.page}" class="document-link" target="_blank">${result.document_name}</a>
<span class="distance">(Distance: ${result.distance.toFixed(4)})</span>
</div>
<div>Page: ${result.page}, Chunk: ${result.chunk}</div>