gergelykalman · josh-abram · Aug 1, 2024 · Aug 1, 2024 · Aug 1, 2024 · Aug 9, 2024
diff --git a/README.md b/README.md
@@ -1,13 +1,59 @@
-### A very simple Confluence to Markdown exporter.
+# Confluence to Markdown Exporter
 
-This code is not written with security in mind, do NOT run it on a repository that can contain mailicious
-page titles.
+A simple tool to export Confluence pages to Markdown format. **Note:** This code is not written with security in mind. Page titles are used to build the output file and directory names, so do **not** run it against a Confluence instance that may contain malicious page titles.
 
+## Installation
 
-### Usage
-1. Install requirements: <code>pip3 install -r requirements.txt</code>
-2. Run the script: <code>python3.9 confluence-markdown-export.py url username token out_dir</code>
-   providing URL e.g. https://YOUR_PROJECT.atlassian.net, login details - username and API Token,
-   and output directory, e.g. ./output_dir
+1. Clone the repository:
+   ```sh
+   git clone https://github.com/josh-abram/confluence-markdown-exporter.git
+   cd confluence-markdown-exporter
+   ```
 
-The secret token can be generated under Profile -> Security -> Manage API Tokens
+2. Install the required packages:
+   ```sh
+   pip install -r requirements.txt
+   ```
+
+## Usage
+
+Run the script with the following command:
+
+```sh
+python3 confluence-markdown-export.py <url> <username> <token> <out_dir> [--space <space>] [--skip-attachments] [--no-fetch]
+```
+
+### Arguments
+
+- **url**: The URL of the Confluence instance (e.g., `https://YOUR_PROJECT.atlassian.net`).
+- **username**: Your Confluence username.
+- **token**: Your Confluence API token.
+- **out_dir**: The directory to output the files to.
+
+### Optional Arguments
+
+- **--space**: Specify the space key to export. If not provided, all spaces will be exported.
+- **--skip-attachments**: Skip fetching attachments.
+- **--no-fetch**: Only run the Markdown conversion without fetching data from Confluence.
+
+### Example
+
+```sh
+python3 confluence-markdown-export.py https://YOUR_PROJECT.atlassian.net your_username your_api_token ./output_dir --space DOCS --skip-attachments
+```
+
+## Generating an API Token
+
+1. Go to your Confluence profile.
+2. Navigate to **Security**.
+3. Under **API Tokens**, click **Manage API Tokens**.
+4. Generate a new token and use it as the `token` argument.
+
+## Notes
+
+- The script currently runs single-threaded, so larger chunk sizes can safely be used during downloads.
+- Ensure you have the necessary permissions to access the Confluence instance and export data.
+
+## License
+
+This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
diff --git a/confluence-markdown-export.py b/confluence-markdown-export.py
@@ -29,14 +29,19 @@ def __init__(self, url, username, token, out_dir, space, no_attach):
         self.__no_attach = no_attach
         self.__space = space
 
-    def __sanitize_filename(self, document_name_raw):
+    def __sanitize_filename(self, document_name_raw, max_length=200):
+        """Replace unsafe substrings with "_" and cap the name at max_length characters."""
         document_name = document_name_raw
-        for invalid in ["..", "/"]:
+        for invalid in ["..", "/", "\\", ":", "*", "?", '"', "<", ">", "|"]:
             if invalid in document_name:
-                print("Dangerous page title: \"{}\", \"{}\" found, replacing it with \"_\"".format(
-                    document_name,
-                    invalid))
+                print(f"Invalid character '{invalid}' found in '{document_name}', replacing it with '_'")
                 document_name = document_name.replace(invalid, "_")
+
+        if len(document_name) > max_length:
+            name, ext = os.path.splitext(document_name)
+            if len(ext) >= max_length:  # a dot inside a long title, not a real extension
+                name, ext = document_name, ""
+            document_name = name[:max_length - len(ext)] + ext
         return document_name
 
     def __dump_page(self, src_id, parents):
@@ -62,7 +67,7 @@ def __dump_page(self, src_id, parents):
 
         # make some rudimentary checks, to prevent trivial errors
         sanitized_filename = self.__sanitize_filename(document_name) + extension
-        sanitized_parents = list(map(self.__sanitize_filename, parents))
+        sanitized_parents = [self.__sanitize_filename(parent) for parent in parents]
 
         page_location = sanitized_parents + [sanitized_filename]
         page_filename = os.path.join(self.__out_dir, *page_location)
@@ -86,7 +91,6 @@ def __dump_page(self, src_id, parents):
                 )
                 att_sanitized_name = self.__sanitize_filename(att_title)
                 att_filename = os.path.join(page_output_dir, ATTACHMENT_FOLDER_NAME, att_sanitized_name)
-
                 att_dirname = os.path.dirname(att_filename)
                 os.makedirs(att_dirname, exist_ok=True)
 
@@ -124,7 +128,7 @@ def __dump_space(self, space):
             homepage_id = space["homepage"]["id"]
             self.__dump_page(homepage_id, parents=[space_key])
 
-    
+
     def dump(self):
         ret = self.__confluence.get_all_spaces(start=0, limit=500, expand='description.plain,homepage')
         if ret['size'] == 0: