diff --git a/local_server/README.md b/local_server/README.md index 970c813..ab36b42 100644 --- a/local_server/README.md +++ b/local_server/README.md @@ -3,7 +3,7 @@ - [ ] Watch local directory for document changes - [x] use Local directory - [ ] Embedding the document, code - - [ ] Markdown Documentation Splitter: TreeSitter md + - [ ] Markdown Documentation Splitter: [TreeSitter-markdown](https://github.com/MDeiml/tree-sitter-markdown) - [x] Office, like Word Document Splitter: [docx-rs](https://github.com/bokuweb/docx-rs) - [x] reader spike: [Document File Text Extractor](https://github.com/anvie/dotext), [docx](https://github.com/PoiScript/docx-rs), [OOXML](https://github.com/zitsen/ooxml-rs), @@ -14,9 +14,10 @@ like: [scraper](https://github.com/BloopAI/bloop/tree/main/server/bleep/src/scraper) - [ ] Document version control - [x] Vector Search: InMemory - - [FANN: Vector Search in 200 Lines of Rust](https://fennel.ai/blog/vector-search-in-200-lines-of-rust/) + - [FANN](https://github.com/fennel-ai/fann) - [FANN: Vector Search in 200 Lines of Rust](https://fennel.ai/blog/vector-search-in-200-lines-of-rust/) - [tinyvector](https://github.com/m1guelpf/tinyvector) - [x] Search document by semantic +- [ ] Embedding Search engine by [tantivy](https://github.com/quickwit-oss/tantivy) ## HTTP API design diff --git a/local_server/src/doc_split/office_splitter.rs b/local_server/src/doc_split/office_splitter.rs index 642352c..7c07b76 100644 --- a/local_server/src/doc_split/office_splitter.rs +++ b/local_server/src/doc_split/office_splitter.rs @@ -1,10 +1,11 @@ +use std::collections::HashMap; use std::fs::File; use std::io::Read; use std::path::PathBuf; use std::ptr; use docx_rs::{DocumentChild, ParagraphChild, read_docx, RunChild}; -use inference_core::Document; +use inference_core::{Document, Metadata}; use tracing::error; use unicode_segmentation::UnicodeSegmentation; @@ -15,7 +16,15 @@ pub struct OfficeSplitter {} impl Splitter for OfficeSplitter { fn split(path: 
&PathBuf, options: &SplitOptions) -> Vec<Document> { let mut documents: Vec<Document> = vec![]; - let document = Self::docx_to_markdown(path); + let document = Self::docx_to_markdown(path).expect("docx_to_markdown error"); + let pure_file_name = path.file_stem().unwrap().to_str().unwrap(); + let mut map = HashMap::new(); + map.insert("file_name".to_string(), pure_file_name.to_string()); + map.insert("file_path".to_string(), path.to_str().unwrap().to_string()); + + let metadata: Metadata = Metadata { + metadata: map, + }; let buf_size = options.chunk_size * 4; let mut buffer = String::with_capacity(buf_size); @@ -23,7 +32,7 @@ impl Splitter for OfficeSplitter { if buffer.len() + word.len() <= buf_size { buffer.push_str(word); } else { - documents.push(Document::from(buffer.clone())); + documents.push(Document::from_with_metadata(buffer.clone(), metadata.clone())); for i in buffer.len() .. buf_size { unsafe{ ptr::write(buffer.as_mut_ptr().add(i), 0x20); }; } @@ -36,10 +45,11 @@ } impl OfficeSplitter { - fn docx_to_markdown(path: &PathBuf) -> String { - let mut file = File::open(path).unwrap(); + fn docx_to_markdown(path: &PathBuf) -> Result<String, std::io::Error> { + let mut file = File::open(path)?; + let mut buf = vec![]; - file.read_to_end(&mut buf).unwrap(); + file.read_to_end(&mut buf)?; let mut text = String::new(); @@ -89,7 +99,7 @@ impl OfficeSplitter { } } - text + Ok(text) } } diff --git a/local_server/src/document_handler.rs b/local_server/src/document_handler.rs index 82bb1ba..f311d21 100644 --- a/local_server/src/document_handler.rs +++ b/local_server/src/document_handler.rs @@ -1,3 +1,4 @@ +use std::collections::HashMap; use actix_web::{get, HttpResponse, post, Responder, web}; use actix_web::http::header::ContentType; use serde::{Deserialize, Serialize}; @@ -30,6 +31,7 @@ async fn search_embedding_document( id: doc.embedding_id, score: doc.score, text: doc.embedded.text, + metadata: doc.embedded.metadata.metadata, }) .collect(); @@ -45,6 +47,7 @@ pub struct 
DocumentResult { pub id: String, pub score: f32, pub text: String, + pub metadata: HashMap<String, String>, } #[derive(Serialize, Deserialize)]