feat(server): add metadata for semantic api result
phodal committed Nov 29, 2023
1 parent f4c6e53 commit d14de66
Showing 3 changed files with 23 additions and 9 deletions.
5 changes: 3 additions & 2 deletions local_server/README.md
@@ -3,7 +3,7 @@
- [ ] Watch local directory for document changes
- [x] use Local directory
- [ ] Embedding the document, code
- [ ] Markdown Documentation Splitter: TreeSitter md
- [ ] Markdown Documentation Splitter: [TreeSitter-markdown](https://github.com/MDeiml/tree-sitter-markdown)
- [x] Office, like Word Document Splitter: [docx-rs](https://github.com/bokuweb/docx-rs)
- [x] reader
spike: [Document File Text Extractor](https://github.com/anvie/dotext), [docx](https://github.com/PoiScript/docx-rs), [OOXML](https://github.com/zitsen/ooxml-rs),
@@ -14,9 +14,10 @@
like: [scraper](https://github.com/BloopAI/bloop/tree/main/server/bleep/src/scraper)
- [ ] Document version control
- [x] Vector Search: InMemory
- [FANN: Vector Search in 200 Lines of Rust](https://fennel.ai/blog/vector-search-in-200-lines-of-rust/)
- [FANN](https://github.com/fennel-ai/fann) - [FANN: Vector Search in 200 Lines of Rust](https://fennel.ai/blog/vector-search-in-200-lines-of-rust/)
- [tinyvector](https://github.com/m1guelpf/tinyvector)
- [x] Search document by semantic
- [ ] Embedding Search engine by [tantivy](https://github.com/quickwit-oss/tantivy)

## HTTP API design

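As a rough illustration of the "Vector Search: InMemory" item in the checklist above (not the FANN or tinyvector code it links to), a brute-force cosine-similarity search over embeddings fits in a few lines of Rust:

```rust
/// Cosine similarity between two embedding vectors.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    let dot: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum();
    let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
    let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
    if norm_a == 0.0 || norm_b == 0.0 { 0.0 } else { dot / (norm_a * norm_b) }
}

/// Return the indices and scores of the `k` stored vectors most similar to `query`.
fn top_k(query: &[f32], store: &[Vec<f32>], k: usize) -> Vec<(usize, f32)> {
    let mut scored: Vec<(usize, f32)> = store
        .iter()
        .enumerate()
        .map(|(i, v)| (i, cosine_similarity(query, v)))
        .collect();
    scored.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
    scored.truncate(k);
    scored
}

fn main() {
    let store = vec![vec![1.0, 0.0], vec![0.0, 1.0], vec![0.7, 0.7]];
    let query = vec![0.9, 0.1];
    println!("{:?}", top_k(&query, &store, 2));
}
```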
24 changes: 17 additions & 7 deletions local_server/src/doc_split/office_splitter.rs
@@ -1,10 +1,11 @@
use std::collections::HashMap;
use std::fs::File;
use std::io::Read;
use std::path::PathBuf;
use std::ptr;

use docx_rs::{DocumentChild, ParagraphChild, read_docx, RunChild};
use inference_core::Document;
use inference_core::{Document, Metadata};
use tracing::error;
use unicode_segmentation::UnicodeSegmentation;

@@ -15,15 +16,23 @@ pub struct OfficeSplitter {}
impl Splitter for OfficeSplitter {
fn split(path: &PathBuf, options: &SplitOptions) -> Vec<Document> {
let mut documents: Vec<Document> = vec![];
let document = Self::docx_to_markdown(path);
let document = Self::docx_to_markdown(path).expect("docx_to_markdown error");
let pure_file_name = path.file_stem().unwrap().to_str().unwrap();
let mut map = HashMap::new();
map.insert("file_name".to_string(), pure_file_name.to_string());
map.insert("file_path".to_string(), path.to_str().unwrap().to_string());

let metadata: Metadata = Metadata {
metadata: map,
};

let buf_size = options.chunk_size * 4;
let mut buffer = String::with_capacity(buf_size);
for word in document.split_sentence_bounds() {
if buffer.len() + word.len() <= buf_size {
buffer.push_str(word);
} else {
documents.push(Document::from(buffer.clone()));
documents.push(Document::from_with_metadata(buffer.clone(), metadata.clone()));
for i in buffer.len() .. buf_size {
unsafe{ ptr::write(buffer.as_mut_ptr().add(i), 0x20); };
}
@@ -36,10 +45,11 @@ impl Splitter for OfficeSplitter {
}

impl OfficeSplitter {
fn docx_to_markdown(path: &PathBuf) -> String {
let mut file = File::open(path).unwrap();
fn docx_to_markdown(path: &PathBuf) -> Result<String, anyhow::Error> {
let mut file = File::open(path)?;

let mut buf = vec![];
file.read_to_end(&mut buf).unwrap();
file.read_to_end(&mut buf)?;

let mut text = String::new();

@@ -89,7 +99,7 @@ impl OfficeSplitter {
}
}

text
Ok(text)
}
}

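The splitter change above attaches per-file metadata (file name and path) to every chunk it emits. Below is a minimal, self-contained sketch of that pattern under a few assumptions: the `Metadata` and `Document` types are stand-ins mirroring what the diff shows rather than the real `inference_core` API, sentence segmentation is reduced to splitting on '.' instead of `unicode_segmentation`, and the buffer reset uses `clear()` rather than the diff's unsafe `ptr::write` loop.

```rust
use std::collections::HashMap;

// Stand-ins for the `inference_core` types referenced in the diff; field and
// constructor names mirror what the diff shows but are not verified against
// the real crate.
#[derive(Clone, Debug)]
struct Metadata {
    metadata: HashMap<String, String>,
}

#[derive(Debug)]
struct Document {
    text: String,
    metadata: Metadata,
}

impl Document {
    fn from_with_metadata(text: String, metadata: Metadata) -> Self {
        Document { text, metadata }
    }
}

/// Greedily pack sentence-like pieces into chunks of roughly `buf_size` bytes,
/// attaching the same per-file metadata to every chunk.
fn chunk_with_metadata(text: &str, chunk_size: usize, metadata: &Metadata) -> Vec<Document> {
    let buf_size = chunk_size * 4;
    let mut documents = Vec::new();
    let mut buffer = String::with_capacity(buf_size);

    for sentence in text.split_inclusive('.') {
        if buffer.len() + sentence.len() > buf_size && !buffer.is_empty() {
            documents.push(Document::from_with_metadata(buffer.clone(), metadata.clone()));
            // `clear` keeps the allocation; the diff instead overwrites the
            // spare capacity with spaces via `ptr::write`.
            buffer.clear();
        }
        buffer.push_str(sentence);
    }
    if !buffer.is_empty() {
        documents.push(Document::from_with_metadata(buffer, metadata.clone()));
    }
    documents
}

fn main() {
    // Hypothetical file metadata, matching the keys inserted in the diff.
    let mut map = HashMap::new();
    map.insert("file_name".to_string(), "handbook".to_string());
    map.insert("file_path".to_string(), "docs/handbook.docx".to_string());
    let metadata = Metadata { metadata: map };

    let docs = chunk_with_metadata("First sentence. Second sentence. Third one.", 8, &metadata);
    for doc in &docs {
        println!("{:?} -> {:?}", doc.metadata.metadata.get("file_name"), doc.text);
    }
}
```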
3 changes: 3 additions & 0 deletions local_server/src/document_handler.rs
@@ -1,3 +1,4 @@
use std::collections::HashMap;
use actix_web::{get, HttpResponse, post, Responder, web};
use actix_web::http::header::ContentType;
use serde::{Deserialize, Serialize};
@@ -30,6 +31,7 @@ async fn search_embedding_document(
id: doc.embedding_id,
score: doc.score,
text: doc.embedded.text,
metadata: doc.embedded.metadata.metadata,
})
.collect();

@@ -45,6 +47,7 @@ pub struct DocumentResult {
pub id: String,
pub score: f32,
pub text: String,
pub metadata: HashMap<String, String>,
}

#[derive(Serialize, Deserialize)]
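With the extra `metadata` field, each semantic search hit now carries where its text came from. A rough sketch of the resulting payload, assuming `serde`/`serde_json` with the derive feature and purely hypothetical sample values:

```rust
use std::collections::HashMap;
use serde::{Deserialize, Serialize};

// Mirrors the `DocumentResult` shape from the diff; only the fields shown
// there are included.
#[derive(Serialize, Deserialize)]
pub struct DocumentResult {
    pub id: String,
    pub score: f32,
    pub text: String,
    pub metadata: HashMap<String, String>,
}

fn main() {
    let mut metadata = HashMap::new();
    metadata.insert("file_name".to_string(), "handbook".to_string());
    metadata.insert("file_path".to_string(), "docs/handbook.docx".to_string());

    let result = DocumentResult {
        id: "42".to_string(),
        score: 0.87,
        text: "matched chunk text".to_string(),
        metadata,
    };

    // Illustrative output only, e.g.
    // {"id":"42","score":0.87,"text":"matched chunk text","metadata":{"file_name":"handbook","file_path":"docs/handbook.docx"}}
    println!("{}", serde_json::to_string_pretty(&result).unwrap());
}
```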
