From 562868e7d8cf292d071885756909e2edc7ab9263 Mon Sep 17 00:00:00 2001
From: Bill Ballou
Date: Mon, 2 Mar 2026 23:44:21 -0500
Subject: [PATCH] feat: add DOCX to text conversion helper script with tests

---
Reviewer notes on this revision:
* convert_docx() wraps each body element directly instead of scanning
  doc.paragraphs / doc.tables for it.
* The usage text names the expected argument.
* The tests close their temp files before use and remove them in
  try/finally.
* The From: header carries no e-mail address, and the blob ids on the
  "index" lines were not regenerated for the revised content.

 .../tabular-extract/scripts/convert_docx.py   |  95 ++++++++++++
 .../scripts/test_convert_docx.py              | 137 ++++++++++++++++++
 2 files changed, 232 insertions(+)
 create mode 100755 skills/tabular-extract/scripts/convert_docx.py
 create mode 100644 skills/tabular-extract/scripts/test_convert_docx.py

diff --git a/skills/tabular-extract/scripts/convert_docx.py b/skills/tabular-extract/scripts/convert_docx.py
new file mode 100755
index 0000000..50efd9e
--- /dev/null
+++ b/skills/tabular-extract/scripts/convert_docx.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python3
+"""
+Convert a DOCX file to plain text.
+
+Extracts text from paragraphs and tables.
+Outputs to stdout for piping into other tools.
+
+Usage:
+    convert_docx.py <file.docx>
+
+Requires:
+    pip install python-docx
+"""
+
+import sys
+import os
+
+
+def convert_docx(filepath: str) -> str:
+    """Extract text from a DOCX file, including paragraphs and tables.
+
+    Top-level body elements are visited in document order, so each table
+    is emitted between the paragraphs that surround it.
+
+    Args:
+        filepath: Path to the .docx file to read.
+
+    Returns:
+        The extracted text. Each non-empty paragraph is one line; each
+        table row is one line with its cells joined by " | ", and every
+        table is followed by a blank line.
+
+    Note:
+        Exits the process with status 1 if python-docx is not installed.
+    """
+    try:
+        from docx import Document
+        from docx.table import Table
+        from docx.text.paragraph import Paragraph
+    except ImportError:
+        print(
+            "Error: python-docx is not installed.\n"
+            "Install it with: pip install python-docx",
+            file=sys.stderr
+        )
+        sys.exit(1)
+
+    doc = Document(filepath)
+    parts = []
+
+    for element in doc.element.body:
+        tag = element.tag.split("}")[-1] if "}" in element.tag else element.tag
+
+        if tag == "p":
+            # Wrap the XML element directly rather than searching
+            # doc.paragraphs for it, which costs a full scan per element.
+            text = Paragraph(element, doc).text.strip()
+            if text:
+                parts.append(text)
+
+        elif tag == "tbl":
+            for row in Table(element, doc).rows:
+                cells = [cell.text.strip() for cell in row.cells]
+                parts.append(" | ".join(cells))
+            parts.append("")  # blank line after table
+
+    return "\n".join(parts)
+
+
+def main():
+    """Command-line entry point: print the text of one DOCX file to stdout."""
+    if len(sys.argv) != 2:
+        print("Usage: convert_docx.py <file.docx>", file=sys.stderr)
+        sys.exit(1)
+
+    filepath = sys.argv[1]
+
+    if not os.path.exists(filepath):
+        print(f"Error: File not found: {filepath}", file=sys.stderr)
+        sys.exit(1)
+
+    if not filepath.lower().endswith(".docx"):
+        print(f"Error: Not a .docx file: {filepath}", file=sys.stderr)
+        sys.exit(1)
+
+    try:
+        text = convert_docx(filepath)
+        print(text)
+    except Exception as e:
+        print(f"Error converting file: {e}", file=sys.stderr)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/skills/tabular-extract/scripts/test_convert_docx.py b/skills/tabular-extract/scripts/test_convert_docx.py
new file mode 100644
index 0000000..10d35af
--- /dev/null
+++ b/skills/tabular-extract/scripts/test_convert_docx.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+"""Tests for convert_docx.py"""
+
+import subprocess
+import sys
+import tempfile
+import os
+
+SCRIPT = os.path.join(os.path.dirname(__file__), "convert_docx.py")
+
+
+def run_script(*args):
+    """Run convert_docx.py with the given arguments and capture its output."""
+    return subprocess.run(
+        [sys.executable, SCRIPT, *args],
+        capture_output=True, text=True
+    )
+
+
+def make_temp_path(suffix):
+    """Create an empty temp file, close it, and return its path.
+
+    The handle is closed before the path is handed out so that the file
+    is never deleted or reopened by name while this process still holds
+    it open.
+    """
+    fd, path = tempfile.mkstemp(suffix=suffix)
+    os.close(fd)
+    return path
+
+
+def test_missing_argument():
+    """Script should print usage and exit 1 when no args given."""
+    result = run_script()
+    assert result.returncode == 1
+    assert "Usage:" in result.stderr
+
+
+def test_nonexistent_file():
+    """Script should error on a file that doesn't exist."""
+    result = run_script("/tmp/nonexistent_file_abc123.docx")
+    assert result.returncode == 1
+    assert "Error" in result.stderr or "not found" in result.stderr.lower()
+
+
+def test_non_docx_file():
+    """Script should error on a non-DOCX file."""
+    path = make_temp_path(".txt")
+    try:
+        with open(path, "wb") as f:
+            f.write(b"hello world")
+        result = run_script(path)
+    finally:
+        os.unlink(path)  # clean up even if the run itself fails
+    assert result.returncode == 1
+
+
+def test_valid_docx():
+    """Script should extract text from a valid DOCX file."""
+    try:
+        from docx import Document
+    except ImportError:
+        print("SKIP: python-docx not installed")
+        return
+
+    doc = Document()
+    doc.add_paragraph("Hello from test document")
+    doc.add_paragraph("Second paragraph here")
+
+    path = make_temp_path(".docx")
+    try:
+        doc.save(path)
+        result = run_script(path)
+    finally:
+        os.unlink(path)
+
+    assert result.returncode == 0
+    assert "Hello from test document" in result.stdout
+    assert "Second paragraph here" in result.stdout
+
+
+def test_docx_with_table():
+    """Script should extract table content from a DOCX file."""
+    try:
+        from docx import Document
+    except ImportError:
+        print("SKIP: python-docx not installed")
+        return
+
+    doc = Document()
+    doc.add_paragraph("Before table")
+    table = doc.add_table(rows=2, cols=2)
+    table.cell(0, 0).text = "Header1"
+    table.cell(0, 1).text = "Header2"
+    table.cell(1, 0).text = "Value1"
+    table.cell(1, 1).text = "Value2"
+    doc.add_paragraph("After table")
+
+    path = make_temp_path(".docx")
+    try:
+        doc.save(path)
+        result = run_script(path)
+    finally:
+        os.unlink(path)
+
+    out = result.stdout
+    assert result.returncode == 0
+    assert "Header1 | Header2" in out
+    assert "Value1 | Value2" in out
+    # The table must come out between the paragraphs that surround it.
+    assert out.index("Before table") < out.index("Header1")
+    assert out.index("Value2") < out.index("After table")
+
+
+if __name__ == "__main__":
+    tests = [
+        test_missing_argument,
+        test_nonexistent_file,
+        test_non_docx_file,
+        test_valid_docx,
+        test_docx_with_table,
+    ]
+    passed = 0
+    failed = 0
+    for test in tests:
+        try:
+            test()
+            print(f" PASS: {test.__name__}")
+            passed += 1
+        except AssertionError as e:
+            print(f" FAIL: {test.__name__} - {e}")
+            failed += 1
+        except Exception as e:
+            print(f" ERROR: {test.__name__} - {e}")
+            failed += 1
+    print(f"\n{passed} passed, {failed} failed")
+    sys.exit(1 if failed else 0)