Skip to content

Commit c7c3e3c

Browse files
authored
feat: convert elements to markdown (#4055)
Creates a staging function `elements_to_md` to convert lists of `Elements` to markdown strings (or a markdown file). Includes unit tests as well as ingest tests and expected output fixtures.
1 parent f66562b commit c7c3e3c

File tree

189 files changed

+16484
-69
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

189 files changed

+16484
-69
lines changed

.github/workflows/ci.yml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,34 @@ jobs:
348348
sudo apt-get install diffstat
349349
./test_unstructured_ingest/check-diff-expected-output-html.sh
350350
351+
test_json_to_markdown:
352+
strategy:
353+
matrix:
354+
python-version: ["3.10"]
355+
runs-on: ubuntu-latest-m
356+
needs: [setup, lint]
357+
steps:
358+
- uses: 'actions/checkout@v4'
359+
- name: Set up Python ${{ matrix.python-version }}
360+
uses: actions/setup-python@v5
361+
with:
362+
python-version: ${{ matrix.python-version }}
363+
- name: Get full Python version
364+
id: full-python-version
365+
run: echo version=$(python -c "import sys; print('-'.join(str(v) for v in sys.version_info))") >> $GITHUB_OUTPUT
366+
- name: Setup virtual environment
367+
uses: ./.github/actions/base-cache
368+
with:
369+
python-version: ${{ matrix.python-version }}
370+
- name: Test markdown fixtures
371+
env:
372+
OVERWRITE_FIXTURES: "false"
373+
PYTHONPATH: ${{ github.workspace }}
374+
run: |
375+
source .venv/bin/activate
376+
sudo apt-get install diffstat
377+
./test_unstructured_ingest/check-diff-expected-output-markdown.sh
378+
351379
test_unstructured_api_unit:
352380
strategy:
353381
matrix:

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,12 @@
1+
## 0.18.9-dev0
2+
3+
### Enhancements
4+
5+
### Features
6+
- **Convert elements to markdown for output** Added function to convert elements to markdown format for easy viewing.
7+
8+
### Fixes
9+
110
## 0.18.8
211

312
### Enhancements

Makefile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -342,3 +342,8 @@ run-jupyter:
342342
html-fixtures-update:
343343
rm -r test_unstructured_ingest/expected-structured-output-html && \
344344
test_unstructured_ingest/structured-json-to-html.sh test_unstructured_ingest/expected-structured-output-html
345+
346+
.PHONY: markdown-fixtures-update
347+
markdown-fixtures-update:
348+
rm -r test_unstructured_ingest/expected-structured-output-markdown && \
349+
test_unstructured_ingest/structured-json-to-markdown.sh test_unstructured_ingest/expected-structured-output-markdown
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
import argparse
2+
import logging
3+
import os
4+
from pathlib import Path
5+
6+
from unstructured.partition.html.convert import elements_to_html
7+
from unstructured.staging.base import elements_from_json, elements_to_md
8+
9+
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
10+
logger = logging.getLogger(__name__)
11+
12+
13+
def json_to_format(
    filepath: Path,
    outdir: Path,
    format_type: str,
    exclude_binary_image_data: bool,
    no_group_by_page: bool,
):
    """Convert one elements JSON file to HTML or Markdown and write it into `outdir`.

    The output file keeps the input's name with its final suffix replaced by
    ``.html`` or ``.md``.

    Args:
        filepath: Path to the JSON file containing serialized elements.
        outdir: Directory that receives the rendered file; created if missing.
        format_type: Either ``"html"`` or ``"markdown"``.
        exclude_binary_image_data: Forwarded to the converter so binary image data
            can be left out of the output.
        no_group_by_page: Forwarded to the HTML converter only; ignored for markdown.

    Raises:
        ValueError: If `format_type` is neither ``"html"`` nor ``"markdown"``.
    """
    # Fail fast on a bad format before any file is read or parsed.
    if format_type not in ("html", "markdown"):
        raise ValueError(f"Unsupported format: {format_type}. Supported formats: html, markdown")

    logger.info("Processing: %s", filepath)
    elements = elements_from_json(str(filepath))

    if format_type == "html":
        output_content = elements_to_html(elements, exclude_binary_image_data, no_group_by_page)
        file_extension = ".html"
    else:
        output_content = elements_to_md(
            elements, exclude_binary_image_data=exclude_binary_image_data
        )
        file_extension = ".md"

    outpath = outdir / filepath.with_suffix(file_extension).name
    outpath.parent.mkdir(parents=True, exist_ok=True)
    # Write UTF-8 explicitly: document text is frequently non-ASCII and the platform
    # default encoding (e.g. cp1252) would otherwise raise or corrupt the output.
    with open(outpath, "w", encoding="utf-8") as f:
        f.write(output_content)
    logger.info("%s rendered and saved to: %s", format_type.upper(), outpath)
39+
40+
41+
def multiple_json_to_format(
    path: Path,
    outdir: Path,
    format_type: str,
    exclude_binary_image_data: bool,
    no_group_by_page: bool,
):
    """Convert every ``.json`` file below `path`, mirroring its sub-directories in `outdir`.

    All remaining arguments are passed through unchanged to `json_to_format` for
    each file found.
    """
    for dirpath, _dirnames, filenames in os.walk(path):
        current_dir = Path(dirpath)
        for filename in filenames:
            if not filename.endswith(".json"):
                continue
            source_file = current_dir / filename
            # Keep the file's position relative to `path` when placing it under `outdir`.
            target_dir = outdir / source_file.relative_to(path).parent
            json_to_format(
                source_file,
                target_dir,
                format_type,
                exclude_binary_image_data,
                no_group_by_page,
            )
60+
61+
62+
def main():
    """Parse command-line arguments and convert a JSON file, or a directory tree of them.

    A file path is converted directly; a directory is walked recursively. A path
    that is neither is reported as a usage error (exit status 2) instead of
    silently doing nothing.
    """
    parser = argparse.ArgumentParser(description="Convert JSON elements to HTML or Markdown.")
    parser.add_argument(
        "filepath",
        type=str,
        help="""Path to the JSON file or directory containing elements.
        If given directory it will convert all JSON files in directory
        and all sub-directories.""",
    )
    parser.add_argument(
        "--outdir", type=str, help="Output directory for the output file.", default=""
    )
    parser.add_argument(
        "--format",
        type=str,
        choices=["html", "markdown"],
        default="html",
        help="Output format: html or markdown (default: html)",
    )
    parser.add_argument(
        "--exclude-img", action="store_true", help="Exclude binary image data from the output."
    )
    parser.add_argument(
        "--no-group", action="store_true", help="Don't group elements by pages (HTML only)."
    )
    args = parser.parse_args()

    filepath = Path(args.filepath)
    outdir = Path(args.outdir)

    if filepath.is_file():
        json_to_format(filepath, outdir, args.format, args.exclude_img, args.no_group)
    elif filepath.is_dir():
        multiple_json_to_format(filepath, outdir, args.format, args.exclude_img, args.no_group)
    else:
        # os.walk() on a missing path yields nothing, so without this check a typo in
        # the path would exit successfully without converting anything.
        parser.error(f"Path does not exist or is not a file or directory: {filepath}")


if __name__ == "__main__":
    main()

scripts/html/elements_json_to_html.py

Lines changed: 0 additions & 66 deletions
This file was deleted.

scripts/user/unstructured-get-json.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -321,7 +321,7 @@ if [ "$WRITE_HTML" = true ]; then
321321
# most elements will not have metadata.text_as_html defined (by design only Table elements do),
322322
# so use the unstructured library's python script for the conversion.
323323
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
324-
PYTHONPATH="${SCRIPT_DIR}/../.." python3 "${SCRIPT_DIR}/../html/elements_json_to_html.py" "${JSON_OUTPUT_FILEPATH}" --outdir "${TMP_OUTPUTS_DIR}"
324+
PYTHONPATH="${SCRIPT_DIR}/../.." python3 "${SCRIPT_DIR}/../convert/elements_json_to_format.py" "${JSON_OUTPUT_FILEPATH}" --outdir "${TMP_OUTPUTS_DIR}"
325325
echo "HTML written using Python script to: ${HTML_OUTPUT_FILEPATH}"
326326
fi
327327

test_unstructured/staging/test_base.py

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
CoordinatesMetadata,
1616
CoordinateSystem,
1717
DataSourceMetadata,
18+
Element,
1819
ElementMetadata,
1920
ElementType,
2021
FigureCaption,
@@ -23,6 +24,7 @@
2324
ListItem,
2425
NarrativeText,
2526
PageBreak,
27+
Table,
2628
Text,
2729
Title,
2830
)
@@ -513,3 +515,140 @@ def test_flatten_empty_dict():
513515
def test_flatten_dict_empty_lists():
514516
"""Flattening a dictionary with empty lists"""
515517
assert base.flatten_dict({"a": [], "b": {"c": []}}) == {"a": [], "b_c": []}
518+
519+
520+
@pytest.mark.parametrize(
    ("json_filename", "expected_md_filename"),
    [
        (
            "test_unstructured/testfiles/staging/UDHR_first_article_all.txt.json",
            "test_unstructured/testfiles/staging/UDHR_first_article_all.txt.md",
        ),
        (
            "test_unstructured/testfiles/staging/embedded-images.pdf.json",
            "test_unstructured/testfiles/staging/embedded-images.pdf.md",
        ),
    ],
)
def test_elements_to_md_conversion(json_filename: str, expected_md_filename: str):
    """Elements rehydrated from JSON convert to the markdown stored in the matching fixture."""
    # Rehydrate elements from JSON
    elements = base.elements_from_json(json_filename)

    # Convert to markdown
    markdown_output = base.elements_to_md(elements)

    # Read the expected fixture as UTF-8 so the comparison does not depend on the
    # locale's default encoding of the machine running the tests.
    with open(expected_md_filename, encoding="utf-8") as f:
        expected_markdown = f.read()

    # Compare outputs
    assert markdown_output == expected_markdown
547+
548+
549+
# Base64 image payload shared by the Image cases below.
_SAMPLE_IMAGE_B64 = (
    "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhf"
    "DwAChwGA60e6kgAAAABJRU5ErkJggg=="
)


@pytest.mark.parametrize(
    ("element", "expected_markdown", "exclude_binary"),
    [
        (Title("Test Title"), "# Test Title", False),
        (NarrativeText("This is some narrative text."), "This is some narrative text.", False),
        # Inline base64 image is embedded as a data URI.
        (
            Image(
                "Test Image",
                metadata=ElementMetadata(
                    image_base64=_SAMPLE_IMAGE_B64,
                    image_mime_type="image/png",
                ),
            ),
            "![Test Image](data:image/png;base64," + _SAMPLE_IMAGE_B64 + ")",
            False,
        ),
        # Same image with binary data excluded falls back to the element text.
        (
            Image(
                "Test Image",
                metadata=ElementMetadata(
                    image_base64=_SAMPLE_IMAGE_B64,
                    image_mime_type="image/png",
                ),
            ),
            "Test Image",
            True,
        ),
        # Image referenced by URL.
        (
            Image(
                "Test Image", metadata=ElementMetadata(image_url="https://example.com/image.jpg")
            ),
            "![Test Image](https://example.com/image.jpg)",
            False,
        ),
        # Table with an HTML rendering emits that HTML.
        (
            Table(
                "Table Text",
                metadata=ElementMetadata(text_as_html="<table><tr><td>Test</td></tr></table>"),
            ),
            "<table><tr><td>Test</td></tr></table>",
            False,
        ),
        # Table without an HTML rendering emits its plain text.
        (Table("Table Text"), "Table Text", False),
    ],
)
def test_element_to_md_conversion(element: "Element", expected_markdown: str, exclude_binary: bool):
    """Each element type converts to the expected markdown string."""
    actual_markdown = base.element_to_md(element, exclude_binary_image_data=exclude_binary)
    assert actual_markdown == expected_markdown
609+
610+
611+
def test_elements_to_md_file_output():
    """elements_to_md returns the markdown and also writes it to `filename` when one is given."""
    expected = "# Test Title\nTest content."
    elements = [Title("Test Title"), NarrativeText("Test content.")]

    # Reserve a path on disk; the handle is closed on leaving the block but the file is kept.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as handle:
        md_path = handle.name

    try:
        returned = base.elements_to_md(elements, filename=md_path)

        # The content is returned to the caller ...
        assert returned == expected

        # ... and the same content is written to the file.
        with open(md_path) as written:
            on_disk = written.read()
        assert on_disk == expected
    finally:
        # Remove the temporary file regardless of the outcome.
        if os.path.exists(md_path):
            os.unlink(md_path)
635+
636+
637+
def test_element_to_md_with_none_mime_type():
    """element_to_md renders an Image whose mime type is None using a wildcard image type."""
    from unstructured.documents.elements import ElementMetadata, Image

    encoded_image = (
        "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhf"
        "DwAChwGA60e6kgAAAABJRU5ErkJggg=="
    )
    element = Image(
        "Test Image",
        metadata=ElementMetadata(image_base64=encoded_image, image_mime_type=None),
    )

    # A missing mime type must not break the conversion.
    rendered = base.element_to_md(element)

    assert "![Test Image](data:image/*" in rendered
    assert "base64," in rendered

0 commit comments

Comments
 (0)