Unstructured-IO
diff --git a/‎.github/workflows/ci.yml‎
Lines changed: 28 additions & 0 deletions b/‎.github/workflows/ci.yml‎
Lines changed: 28 additions & 0 deletions
diff --git a/‎CHANGELOG.md‎
Lines changed: 9 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎Makefile‎
Lines changed: 5 additions & 0 deletions b/‎Makefile‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎scripts/convert/elements_json_to_format.py‎
Lines changed: 99 additions & 0 deletions b/‎scripts/convert/elements_json_to_format.py‎
Lines changed: 99 additions & 0 deletions
diff --git a/‎scripts/html/rendered_html_from_elements.py‎ renamed to ‎scripts/convert/rendered_html_from_elements.py‎ b/‎scripts/html/rendered_html_from_elements.py‎ renamed to ‎scripts/convert/rendered_html_from_elements.py‎
diff --git a/‎scripts/html/elements_json_to_html.py‎
Lines changed: 0 additions & 66 deletions b/‎scripts/html/elements_json_to_html.py‎
Lines changed: 0 additions & 66 deletions
diff --git a/‎scripts/user/unstructured-get-json.sh‎
Lines changed: 1 addition & 1 deletion b/‎scripts/user/unstructured-get-json.sh‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎test_unstructured/staging/test_base.py‎
Lines changed: 139 additions & 0 deletions b/‎test_unstructured/staging/test_base.py‎
Lines changed: 139 additions & 0 deletions
@@ -348,6 +348,34 @@ jobs:
         sudo apt-get install diffstat
         ./test_unstructured_ingest/check-diff-expected-output-html.sh
 
+  # Runs test_unstructured_ingest/check-diff-expected-output-markdown.sh on Python 3.10
+  # once the `setup` and `lint` jobs have completed.
+  test_json_to_markdown:
+    strategy:
+      matrix:
+        python-version: ["3.10"]
+    runs-on: ubuntu-latest-m
+    needs: [setup, lint]
+    steps:
+    - uses: 'actions/checkout@v4'
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v5
+      with:
+        python-version: ${{ matrix.python-version }}
+    # NOTE(review): no later step in this job reads `steps.full-python-version.outputs.version`;
+    # confirm whether the base-cache action or another consumer depends on it.
+    - name: Get full Python version
+      id: full-python-version
+      run: echo version=$(python -c "import sys; print('-'.join(str(v) for v in sys.version_info))") >> $GITHUB_OUTPUT
+    - name: Setup virtual environment
+      uses: ./.github/actions/base-cache
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Test markdown fixtures
+      env:
+        # presumably makes the check script compare against the stored fixtures instead of
+        # regenerating them -- verify against the script
+        OVERWRITE_FIXTURES: "false"
+        PYTHONPATH: ${{ github.workspace }}
+      run: |
+        source .venv/bin/activate
+        # presumably diffstat is used by the check script to summarise differences -- verify
+        sudo apt-get install diffstat
+        ./test_unstructured_ingest/check-diff-expected-output-markdown.sh
+
   test_unstructured_api_unit:
     strategy:
       matrix:
 
@@ -1,3 +1,12 @@
+## 0.18.9-dev0
+
+### Enhancements
+
+### Features
+- **Convert elements to markdown for output** Added function to convert elements to markdown format for easy viewing.
+
+### Fixes
+
 ## 0.18.8
 
 ### Enhancements
 
@@ -342,3 +342,8 @@ run-jupyter:
 html-fixtures-update:
 	rm -r test_unstructured_ingest/expected-structured-output-html && \
 	test_unstructured_ingest/structured-json-to-html.sh test_unstructured_ingest/expected-structured-output-html
+
+# Delete the expected markdown output directory, then run structured-json-to-markdown.sh with
+# that directory as its argument. Because the two commands are joined with `&&`, the script
+# is not run if `rm -r` fails (for example when the directory does not exist yet).
+.PHONY: markdown-fixtures-update
+markdown-fixtures-update:
+	rm -r test_unstructured_ingest/expected-structured-output-markdown && \
+	test_unstructured_ingest/structured-json-to-markdown.sh test_unstructured_ingest/expected-structured-output-markdown
@@ -0,0 +1,99 @@
+import argparse
+import logging
+import os
+from pathlib import Path
+
+from unstructured.partition.html.convert import elements_to_html
+from unstructured.staging.base import elements_from_json, elements_to_md
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+logger = logging.getLogger(__name__)
+
+
+def json_to_format(
+    filepath: Path,
+    outdir: Path,
+    format_type: str,
+    exclude_binary_image_data: bool,
+    no_group_by_page: bool,
+):
+    logger.info("Processing: %s", filepath)
+    elements = elements_from_json(str(filepath))
+
+    if format_type == "html":
+        output_content = elements_to_html(elements, exclude_binary_image_data, no_group_by_page)
+        file_extension = ".html"
+    elif format_type == "markdown":
+        output_content = elements_to_md(
+            elements, exclude_binary_image_data=exclude_binary_image_data
+        )
+        file_extension = ".md"
+    else:
+        raise ValueError(f"Unsupported format: {format_type}. Supported formats: html, markdown")
+
+    outpath = outdir / filepath.with_suffix(file_extension).name
+    os.makedirs(outpath.parent, exist_ok=True)
+    with open(outpath, "w+") as f:
+        f.write(output_content)
+    logger.info(f"{format_type.upper()} rendered and saved to: %s", outpath)
+
+
+def multiple_json_to_format(
+    path: Path,
+    outdir: Path,
+    format_type: str,
+    exclude_binary_image_data: bool,
+    no_group_by_page: bool,
+):
+    for root, _, files in os.walk(path):
+        for file in files:
+            if file.endswith(".json"):
+                json_file_path = Path(root) / file
+                outpath = outdir / json_file_path.relative_to(path).parent
+                json_to_format(
+                    json_file_path,
+                    outpath,
+                    format_type,
+                    exclude_binary_image_data,
+                    no_group_by_page,
+                )
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Convert JSON elements to HTML or Markdown.")
+    parser.add_argument(
+        "filepath",
+        type=str,
+        help="""Path to the JSON file or directory containing elements.
+        If given directory it will convert all JSON files in directory
+        and all sub-directories.""",
+    )
+    parser.add_argument(
+        "--outdir", type=str, help="Output directory for the output file.", default=""
+    )
+    parser.add_argument(
+        "--format",
+        type=str,
+        choices=["html", "markdown"],
+        default="html",
+        help="Output format: html or markdown (default: html)",
+    )
+    parser.add_argument(
+        "--exclude-img", action="store_true", help="Exclude binary image data from the output."
+    )
+    parser.add_argument(
+        "--no-group", action="store_true", help="Don't group elements by pages (HTML only)."
+    )
+    args = parser.parse_args()
+
+    filepath = Path(args.filepath)
+    outdir = Path(args.outdir)
+
+    if filepath.is_file():
+        json_to_format(filepath, outdir, args.format, args.exclude_img, args.no_group)
+    else:
+        multiple_json_to_format(filepath, outdir, args.format, args.exclude_img, args.no_group)
+
+
+if __name__ == "__main__":
+    main()
@@ -321,7 +321,7 @@ if [ "$WRITE_HTML" = true ]; then
     # most elements will not have metadata.text_as_html defined (by design only Table elements do),
     # so use the unstructured library's python script for the conversion.
     SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-    PYTHONPATH="${SCRIPT_DIR}/../.." python3 "${SCRIPT_DIR}/../html/elements_json_to_html.py" "${JSON_OUTPUT_FILEPATH}" --outdir "${TMP_OUTPUTS_DIR}"
+    PYTHONPATH="${SCRIPT_DIR}/../.." python3 "${SCRIPT_DIR}/../convert/elements_json_to_format.py" "${JSON_OUTPUT_FILEPATH}" --outdir "${TMP_OUTPUTS_DIR}"
     echo "HTML written using Python script to: ${HTML_OUTPUT_FILEPATH}"
   fi
 
 
@@ -15,6 +15,7 @@
     CoordinatesMetadata,
     CoordinateSystem,
     DataSourceMetadata,
+    Element,
     ElementMetadata,
     ElementType,
     FigureCaption,
@@ -23,6 +24,7 @@
     ListItem,
     NarrativeText,
     PageBreak,
+    Table,
     Text,
     Title,
 )
@@ -513,3 +515,140 @@ def test_flatten_empty_dict():
 def test_flatten_dict_empty_lists():
     """Flattening a dictionary with empty lists"""
     assert base.flatten_dict({"a": [], "b": {"c": []}}) == {"a": [], "b_c": []}
+
+
+@pytest.mark.parametrize(
+    ("json_filename", "expected_md_filename"),
+    [
+        (
+            "test_unstructured/testfiles/staging/UDHR_first_article_all.txt.json",
+            "test_unstructured/testfiles/staging/UDHR_first_article_all.txt.md",
+        ),
+        (
+            "test_unstructured/testfiles/staging/embedded-images.pdf.json",
+            "test_unstructured/testfiles/staging/embedded-images.pdf.md",
+        ),
+    ],
+)
+def test_elements_to_md_conversion(json_filename: str, expected_md_filename: str):
+    """Test that elements_from_json followed by elements_to_md produces expected markdown output."""
+    # Rehydrate elements from JSON
+    elements = base.elements_from_json(json_filename)
+
+    # Convert to markdown
+    markdown_output = base.elements_to_md(elements)
+
+    # Read expected markdown fixture
+    with open(expected_md_filename) as f:
+        expected_markdown = f.read()
+
+    # Compare outputs
+    assert markdown_output == expected_markdown
+
+
+@pytest.mark.parametrize(
+    ("element", "expected_markdown", "exclude_binary"),
+    [
+        (Title("Test Title"), "# Test Title", False),
+        (NarrativeText("This is some narrative text."), "This is some narrative text.", False),
+        (
+            Image(
+                "Test Image",
+                metadata=ElementMetadata(
+                    image_base64=(
+                        "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhf"
+                        "DwAChwGA60e6kgAAAABJRU5ErkJggg=="
+                    ),
+                    image_mime_type="image/png",
+                ),
+            ),
+            (
+                "![Test Image](data:image/png;base64,"
+                "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgA"
+                "AAABJRU5ErkJggg==)"
+            ),
+            False,
+        ),
+        (
+            Image(
+                "Test Image",
+                metadata=ElementMetadata(
+                    image_base64=(
+                        "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60"
+                        "e6kgAAAABJRU5ErkJggg=="
+                    ),
+                    image_mime_type="image/png",
+                ),
+            ),
+            "Test Image",
+            True,
+        ),
+        (
+            Image(
+                "Test Image", metadata=ElementMetadata(image_url="https://example.com/image.jpg")
+            ),
+            "![Test Image](https://example.com/image.jpg)",
+            False,
+        ),
+        (
+            Table(
+                "Table Text",
+                metadata=ElementMetadata(text_as_html="<table><tr><td>Test</td></tr></table>"),
+            ),
+            "<table><tr><td>Test</td></tr></table>",
+            False,
+        ),
+        (Table("Table Text"), "Table Text", False),
+    ],
+)
+def test_element_to_md_conversion(element: "Element", expected_markdown: str, exclude_binary: bool):
+    """Test individual element to markdown conversion for different element types."""
+    assert (
+        base.element_to_md(element, exclude_binary_image_data=exclude_binary) == expected_markdown
+    )
+
+
+def test_elements_to_md_file_output():
+    """Test elements_to_md function with file output."""
+
+    elements = [Title("Test Title"), NarrativeText("Test content.")]
+
+    # Test file output
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as tmp_file:
+        tmp_filename = tmp_file.name
+
+    try:
+        markdown_content = base.elements_to_md(elements, filename=tmp_filename)
+
+        # Check that the function returns the content
+        assert markdown_content == "# Test Title\nTest content."
+
+        # Check that the file was created with correct content
+        with open(tmp_filename) as f:
+            file_content = f.read()
+        assert file_content == "# Test Title\nTest content."
+
+    finally:
+        # Clean up
+        if os.path.exists(tmp_filename):
+            os.unlink(tmp_filename)
+
+
+def test_element_to_md_with_none_mime_type():
+    """Test element_to_md handles None mime_type gracefully."""
+    from unstructured.documents.elements import ElementMetadata, Image
+
+    # Test Image element with None mime_type
+    image_metadata = ElementMetadata(
+        image_base64=(
+            "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU"
+            "5ErkJggg=="
+        ),
+        image_mime_type=None,
+    )
+    image_element = Image("Test Image", metadata=image_metadata)
+
+    # Should handle None mime_type gracefully
+    result = base.element_to_md(image_element)
+    assert "![Test Image](data:image/*" in result
+    assert "base64," in result