Skip to content

Commit

Permalink
feat: add basic text for thinking in chunk
Browse files Browse the repository at this point in the history
  • Loading branch information
phodal committed Nov 28, 2023
1 parent 3b8be15 commit 2965c9d
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 18 deletions.
2 changes: 1 addition & 1 deletion local_server/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
- [ ] Watch local directory for document changes
- [ ] Embedding the document, code
- [ ] Markdown Documentation Splitter: TreeSitter md
- [ ] Office, like Word Document Splitter: [docx-rs](https://github.com/bokuweb/docx-rs)
- [x] Office, like Word Document Splitter: [docx-rs](https://github.com/bokuweb/docx-rs)
- [x] reader
spike: [Document File Text Extractor](https://github.com/anvie/dotext), [docx](https://github.com/PoiScript/docx-rs), [OOXML](https://github.com/zitsen/ooxml-rs),
- [ ] Code Splitter: TreeSitter,
Expand Down
Binary file added local_server/_fixtures/header.docx
Binary file not shown.
71 changes: 55 additions & 16 deletions local_server/src/doc_split/word_splitter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,43 +12,58 @@ pub struct WordSplitter {}

impl Splitter for WordSplitter {
    /// Splits the Word document at `path` into [`Document`] values.
    ///
    /// The file is converted to Markdown through `docx_to_markdown`, but that
    /// Markdown is not turned into documents here: the returned vector is
    /// always empty.
    fn split(path: &PathBuf) -> Vec<Document> {
        // The conversion still runs; its result is not used in this function.
        let _markdown = Self::docx_to_markdown(path);

        Vec::new()
    }
}

impl WordSplitter {
fn docx_to_markdown(path: &PathBuf) -> String {
let mut file = File::open(path).unwrap();
let mut buf = vec![];
file.read_to_end(&mut buf).unwrap();

let mut documents: Vec<Document> = vec![];

let mut text = String::new(); // Declare as mutable String
let mut text = String::new();

match read_docx(&*buf) {
Ok(content) => {
content.document.children.iter().for_each(|child| {
match child {
DocumentChild::Paragraph(para) => {
let heading = match &para.property.style {
None => "",
Some(style) => {
match style.val.as_str() {
"Heading1" => "# ",
"Heading2" => "## ",
"Heading3" => "### ",
"Heading4" => "#### ",
"Heading5" => "##### ",
"Heading6" => "###### ",
_ => ""
}
}
};

let mut para_text = String::new();
para.children.iter().for_each(|child| {
match child {
ParagraphChild::Run(run) => {
text += &run.children.iter().map(|child| {
para_text += &run.children.iter().map(|child| {
match child {
RunChild::Text(text) => text.text.clone(),
_ => String::new(),
}
}).collect::<String>();
}
ParagraphChild::Insert(_) => {}
ParagraphChild::Delete(_) => {}
ParagraphChild::BookmarkStart(_) => {}
ParagraphChild::Hyperlink(_) => {}
ParagraphChild::BookmarkEnd(_) => {}
ParagraphChild::CommentStart(_) => {}
ParagraphChild::CommentEnd(_) => {}
ParagraphChild::StructuredDataTag(_) => {}
_ => {}
}
});

text += "\n\n";
text = format!("{}{}{}\n", text, heading, para_text);
}
DocumentChild::Table(_) => {}
_ => {}
}
});
Expand All @@ -58,8 +73,32 @@ impl Splitter for WordSplitter {
}
}

println!("text: {}", text);
text
}
}

documents
#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use crate::doc_split::word_splitter::WordSplitter;
    use crate::infra::file_walker::FileWalker;

    /// Converts the `_fixtures/header.docx` fixture to Markdown and checks
    /// that the two headings come out with `#` / `##` prefixes while the
    /// remaining paragraph is emitted as plain text.
    #[test]
    fn test_word_splitter() {
        // Relative path: the fixture is resolved against the directory the
        // test binary is run from.
        let mut fixture = PathBuf::from("_fixtures");
        fixture.push("header.docx");
        let indexed = FileWalker::index_directory(fixture);

        // Only the first path returned by the walker is converted.
        let docx_path = indexed.first().unwrap();
        let markdown = WordSplitter::docx_to_markdown(docx_path);

        let expected = "# Heading 1
## Heading 2
Normal Context
";
        assert_eq!(markdown, expected);
    }
}
2 changes: 1 addition & 1 deletion local_server/src/document_handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use crate::app_state::AppState;
#[post("/tickets/{id}")]
async fn create_embedding_document(
req: web::Json<ReqDocument>,
data: web::Data<AppState>,
_data: web::Data<AppState>,
) -> impl Responder {
let response = serde_json::to_string(&req).unwrap();

Expand Down

0 comments on commit 2965c9d

Please sign in to comment.