Skip to content

Commit

Permalink
Merge pull request #21 from skytin1004/update
Browse files Browse the repository at this point in the history
Fix issue with markdown translation where code blocks were split across chunks
  • Loading branch information
skytin1004 authored Sep 29, 2024
2 parents 1df321d + 5a9b223 commit ba135e8
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 8 deletions.
29 changes: 22 additions & 7 deletions src/co_op_translator/translators/markdown_translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,15 @@
from semantic_kernel import Kernel
from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion
from semantic_kernel.prompt_template.prompt_template_config import PromptTemplateConfig
from co_op_translator.utils.markdown_utils import process_markdown, update_links, generate_prompt_template, count_links_in_markdown, process_markdown_with_many_links
from co_op_translator.utils.markdown_utils import (
process_markdown,
update_links,
generate_prompt_template,
count_links_in_markdown,
process_markdown_with_many_links,
replace_code_blocks_and_inline_code,
restore_code_blocks_and_inline_code
)
from co_op_translator.config.base_config import Config
from co_op_translator.config.font_config import FontConfig
import time
Expand Down Expand Up @@ -56,22 +64,29 @@ async def translate_markdown(self, document: str, language_code: str, md_file_pa
str: The translated content with updated links and a disclaimer appended.
"""
md_file_path = Path(md_file_path)
link_limit = 30

if count_links_in_markdown(document) > link_limit:
# Step 1: Replace code blocks and inline code with placeholders
document_with_placeholders, placeholder_map = replace_code_blocks_and_inline_code(document)

# Step 2: Split the document into chunks and generate prompts
link_limit = 30
if count_links_in_markdown(document_with_placeholders) > link_limit:
logger.info(f"Document contains more than {link_limit} links, splitting the document into chunks.")
document_chunks = process_markdown_with_many_links(document, link_limit)
document_chunks = process_markdown_with_many_links(document_with_placeholders, link_limit)
else:
logger.info(f"Document contains {link_limit} or fewer links, processing normally.")
document_chunks = process_markdown(document)
document_chunks = process_markdown(document_with_placeholders)

# Step 3: Generate translation prompts and translate each chunk
prompts = [generate_prompt_template(language_code, chunk, self.font_config.is_rtl(language_code)) for chunk in document_chunks]

results = await self._run_prompts_sequentially(prompts)
translated_content = "\n".join(results)

updated_content = update_links(md_file_path, translated_content, language_code, self.root_dir)
# Step 4: Restore the code blocks and inline code from placeholders
translated_content = restore_code_blocks_and_inline_code(translated_content, placeholder_map)

# Step 5: Update links and add disclaimer
updated_content = update_links(md_file_path, translated_content, language_code, self.root_dir)
disclaimer = await self.generate_disclaimer(language_code)
updated_content += "\n\n" + disclaimer

Expand Down
52 changes: 51 additions & 1 deletion src/co_op_translator/utils/markdown_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def generate_prompt_template(output_lang: str, document_chunk: str, is_rtl: bool
Make sure the translation does not sound too literal. Make sure you translate comments as well.
This file is written in Markdown format. Do not treat this as XML or HTML.
Do not translate any [!NOTE], [!WARNING], [!TIP], [!IMPORTANT], or [!CAUTION].
Do not translate any entities, such as variable names, function names, or class names, but keep them in the file.
Do not translate any entities, such as variable names, function names, class names, or placeholders like @@INLINE_CODE_x@@ or @@CODE_BLOCK_x@@, but keep them in the file.
Do not translate any urls or paths, but keep them in the file.
"""

Expand Down Expand Up @@ -328,3 +328,53 @@ def count_links_in_markdown(content: str) -> int:

link_pattern = re.compile(r"\[.*?\]\(.*?\)")
return len(link_pattern.findall(content))

def replace_code_blocks_and_inline_code(document: str):
    """
    Replace code blocks and inline code in the document with placeholders.

    The document is scanned once, left to right. At every position a fenced
    code block (```...```) is tried before an inline span (`...`), so the
    backticks of a fence are never mistaken for inline-code delimiters and
    inline code that follows a fenced block is still detected.

    Args:
        document (str): The markdown document to process.

    Returns:
        tuple: A tuple containing:
            - The document with placeholders of the form @@CODE_BLOCK_n@@
              and @@INLINE_CODE_n@@ (each kind numbered from 0 in order of
              appearance).
            - A dictionary mapping placeholders to their original code.
    """
    # Group 1 = fenced block, group 2 = inline span. Order matters: the
    # fenced alternative must come first so "```" is consumed as a fence.
    code_pattern = re.compile(r'(```[\s\S]*?```)|(`[^`]+`)')

    placeholder_map = {}
    counters = {"CODE_BLOCK": 0, "INLINE_CODE": 0}

    def _to_placeholder(match):
        # Decide which alternative matched and hand out the next index
        # for that kind of snippet.
        kind = "CODE_BLOCK" if match.group(1) is not None else "INLINE_CODE"
        placeholder = f"@@{kind}_{counters[kind]}@@"
        counters[kind] += 1
        placeholder_map[placeholder] = match.group(0)
        return placeholder

    document = code_pattern.sub(_to_placeholder, document)
    return document, placeholder_map

def restore_code_blocks_and_inline_code(translated_document: str, placeholder_map: dict):
    """
    Restore code blocks and inline code into the translated document from the placeholders.

    All placeholders are substituted in a single pass, so text that has
    already been restored is never rescanned. This keeps code that happens
    to contain a placeholder-looking token from being expanded a second time.

    Args:
        translated_document (str): The translated document containing placeholders.
        placeholder_map (dict): A dictionary mapping placeholders to their original code.

    Returns:
        str: The translated document with the original code blocks and inline code restored.
    """
    # An empty alternation would match the empty string everywhere.
    if not placeholder_map:
        return translated_document

    # Longest keys first so that, if one key is a prefix of another, the
    # longer placeholder wins regardless of dictionary order.
    ordered_keys = sorted(placeholder_map, key=len, reverse=True)
    placeholder_pattern = re.compile("|".join(re.escape(key) for key in ordered_keys))

    # A callable replacement inserts the code verbatim; backslashes in the
    # restored code are not interpreted as regex escapes.
    return placeholder_pattern.sub(
        lambda match: placeholder_map[match.group(0)],
        translated_document,
    )

0 comments on commit ba135e8

Please sign in to comment.