From 562868e7d8cf292d071885756909e2edc7ab9263 Mon Sep 17 00:00:00 2001
From: Bill Ballou <bill@bballou.com>
Date: Mon, 2 Mar 2026 23:44:21 -0500
Subject: [PATCH] feat: add DOCX to text conversion helper script with tests

---
 .../tabular-extract/scripts/convert_docx.py   |  83 ++++++++++++
 .../scripts/test_convert_docx.py              | 122 ++++++++++++++++++
 2 files changed, 205 insertions(+)
 create mode 100755 skills/tabular-extract/scripts/convert_docx.py
 create mode 100644 skills/tabular-extract/scripts/test_convert_docx.py

diff --git a/skills/tabular-extract/scripts/convert_docx.py b/skills/tabular-extract/scripts/convert_docx.py
new file mode 100755
index 0000000..50efd9e
--- /dev/null
+++ b/skills/tabular-extract/scripts/convert_docx.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+"""
+Convert a DOCX file to plain text.
+
+Extracts text from paragraphs and tables.
+Outputs to stdout for piping into other tools.
+
+Usage:
+    convert_docx.py <path-to-file.docx>
+
+Requires:
+    pip install python-docx
+"""
+
+import sys
+import os
+
+
+def convert_docx(filepath: str) -> str:
+    """Return the text of a DOCX file: paragraphs and tables, in document order.
+
+    Blank paragraphs are dropped. Each table row becomes one line with cells
+    joined by " | ", and a blank line follows every table. Exits with status 1
+    if python-docx is not installed.
+    """
+    try:
+        from docx import Document
+        from docx.table import Table
+        from docx.text.paragraph import Paragraph
+    except ImportError:
+        print(
+            "Error: python-docx is not installed.\n"
+            "Install it with: pip install python-docx",
+            file=sys.stderr
+        )
+        sys.exit(1)
+
+    doc = Document(filepath)
+    parts = []
+
+    # Wrap each body child directly instead of re-scanning doc.paragraphs /
+    # doc.tables per element, which made conversion quadratic in body size.
+    for element in doc.element.body:
+        tag = element.tag.split("}")[-1] if "}" in element.tag else element.tag
+
+        if tag == "p":
+            text = Paragraph(element, doc).text.strip()
+            if text:
+                parts.append(text)
+
+        elif tag == "tbl":
+            for row in Table(element, doc).rows:
+                parts.append(" | ".join(cell.text.strip() for cell in row.cells))
+            parts.append("")  # blank line after table
+
+    return "\n".join(parts)
+
+
+def main():
+    """CLI entry point: validate the single path argument, then print the text."""
+    args = sys.argv[1:]
+    if len(args) != 1:
+        print("Usage: convert_docx.py <path-to-file.docx>", file=sys.stderr)
+        sys.exit(1)
+    (filepath,) = args
+    problem = None
+    if not os.path.exists(filepath):
+        problem = f"Error: File not found: {filepath}"
+    elif not filepath.lower().endswith(".docx"):
+        problem = f"Error: Not a .docx file: {filepath}"
+    if problem is not None:
+        print(problem, file=sys.stderr)
+        sys.exit(1)
+
+    try:
+        print(convert_docx(filepath))
+    except Exception as e:
+        print(f"Error converting file: {e}", file=sys.stderr)
+        sys.exit(1)
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script
+    main()
diff --git a/skills/tabular-extract/scripts/test_convert_docx.py b/skills/tabular-extract/scripts/test_convert_docx.py
new file mode 100644
index 0000000..10d35af
--- /dev/null
+++ b/skills/tabular-extract/scripts/test_convert_docx.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python3
+"""Tests for convert_docx.py"""
+
+import subprocess
+import sys
+import tempfile
+import os
+
+SCRIPT = os.path.join(os.path.dirname(__file__), "convert_docx.py")  # CLI script under test
+
+
+def test_missing_argument():
+    """With no arguments the script exits 1 and writes usage to stderr."""
+    cmd = [sys.executable, SCRIPT]
+    proc = subprocess.run(cmd, capture_output=True, text=True)
+    status, err = proc.returncode, proc.stderr
+    # Check the exit status first, then the usage banner.
+    assert status == 1
+    assert "Usage:" in err
+
+
+def test_nonexistent_file():
+    """Script should exit 1 with a "File not found" error for a missing path."""
+    # Build the path under a fresh temp dir so it is guaranteed not to exist.
+    with tempfile.TemporaryDirectory() as tmp:
+        missing = os.path.join(tmp, "nonexistent_file_abc123.docx")
+        result = subprocess.run([sys.executable, SCRIPT, missing], capture_output=True, text=True)
+    assert result.returncode == 1
+    assert "File not found" in result.stderr  # the old `or` accepted any "Error"
+
+
+def test_non_docx_file():
+    """Script should exit 1 with a "Not a .docx file" error for a wrong suffix."""
+    with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as f:
+        f.write(b"hello world")
+    # Close the handle first: unlinking a still-open file fails on Windows.
+    try:
+        result = subprocess.run([sys.executable, SCRIPT, f.name], capture_output=True, text=True)
+    finally:
+        os.unlink(f.name)  # clean up even if the subprocess call raises
+    assert result.returncode == 1
+    assert "Not a .docx file" in result.stderr
+
+
+def test_valid_docx():
+    """Script should print every paragraph of a valid DOCX, in document order."""
+    try:
+        from docx import Document
+    except ImportError:
+        print("SKIP: python-docx not installed")
+        return
+
+    doc = Document()
+    doc.add_paragraph("Hello from test document")
+    doc.add_paragraph("Second paragraph here")
+
+    # Reserve a name only; the handle is closed before python-docx writes to it.
+    with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as f:
+        path = f.name
+    try:
+        doc.save(path)
+        result = subprocess.run([sys.executable, SCRIPT, path], capture_output=True, text=True)
+    finally:
+        os.unlink(path)  # clean up even if saving or the subprocess fails
+
+    assert result.returncode == 0
+    assert result.stdout.splitlines() == ["Hello from test document", "Second paragraph here"]
+
+
+def test_docx_with_table():
+    """Script should emit table rows as " | "-joined lines, in document order."""
+    try:
+        from docx import Document
+    except ImportError:
+        print("SKIP: python-docx not installed")
+        return
+
+    doc = Document()
+    doc.add_paragraph("Before table")
+    table = doc.add_table(rows=2, cols=2)
+    table.cell(0, 0).text = "Header1"
+    table.cell(0, 1).text = "Header2"
+    table.cell(1, 0).text = "Value1"
+    table.cell(1, 1).text = "Value2"
+    doc.add_paragraph("After table")
+
+    with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as f:
+        path = f.name  # name only; the handle closes before python-docx writes
+    try:
+        doc.save(path)
+        result = subprocess.run([sys.executable, SCRIPT, path], capture_output=True, text=True)
+    finally:
+        os.unlink(path)  # clean up even if saving or the subprocess fails
+
+    assert result.returncode == 0
+    expected = ["Before table", "Header1 | Header2", "Value1 | Value2", "", "After table"]
+    assert result.stdout.splitlines() == expected
+
+
+if __name__ == "__main__":
+    # Minimal runner so the suite also works without pytest: one report line
+    # per test, then a summary; the exit status is 1 if any test failed.
+    ok = bad = 0
+    for case in (
+        test_missing_argument,
+        test_nonexistent_file,
+        test_non_docx_file,
+        test_valid_docx,
+        test_docx_with_table,
+    ):
+        try:
+            case()
+            print(f"  PASS: {case.__name__}")
+            ok += 1
+        except AssertionError as exc:
+            print(f"  FAIL: {case.__name__} - {exc}")
+            bad += 1
+        except Exception as exc:
+            print(f"  ERROR: {case.__name__} - {exc}")
+            bad += 1
+    print(f"\n{ok} passed, {bad} failed")
+    sys.exit(1 if bad else 0)