feat: add DOCX to text conversion helper script with tests
This commit is contained in:
83
skills/tabular-extract/scripts/convert_docx.py
Executable file
83
skills/tabular-extract/scripts/convert_docx.py
Executable file
@@ -0,0 +1,83 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Convert a DOCX file to plain text.
|
||||||
|
|
||||||
|
Extracts text from paragraphs and tables.
|
||||||
|
Outputs to stdout for piping into other tools.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
convert_docx.py <path-to-file.docx>
|
||||||
|
|
||||||
|
Requires:
|
||||||
|
pip install python-docx
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
def convert_docx(filepath: str) -> str:
    """Extract text from a DOCX file, including paragraphs and tables.

    Walks the children of the document body in order, so paragraphs and
    tables appear in the output in the same sequence as in the document.

    Args:
        filepath: Path to the .docx file to read.

    Returns:
        The extracted text with entries joined by newlines. Paragraph text is
        stripped and empty paragraphs are dropped. Each table row becomes one
        line of stripped cell texts joined by " | ", and every table is
        followed by a blank line.

    Prints an error to stderr and exits with status 1 when python-docx cannot
    be imported. Errors raised while opening the file propagate to the caller.
    """
    try:
        from docx import Document
        from docx.oxml.ns import qn
        from docx.table import Table
        from docx.text.paragraph import Paragraph
    except ImportError:
        print(
            "Error: python-docx is not installed.\n"
            "Install it with: pip install python-docx",
            file=sys.stderr,
        )
        sys.exit(1)

    doc = Document(filepath)
    paragraph_tag = qn("w:p")
    table_tag = qn("w:tbl")
    parts = []

    for element in doc.element.body:
        # Wrap each body child directly instead of rescanning doc.paragraphs /
        # doc.tables for a matching element, which was quadratic in the number
        # of body elements.
        if element.tag == paragraph_tag:
            text = Paragraph(element, doc).text.strip()
            if text:
                parts.append(text)
        elif element.tag == table_tag:
            for row in Table(element, doc).rows:
                parts.append(" | ".join(cell.text.strip() for cell in row.cells))
            parts.append("")  # blank line after table

    return "\n".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point: validate the argument, convert, and print.

    Expects exactly one command-line argument, the path to a .docx file.
    Any validation or conversion failure is reported on stderr and the
    process exits with status 1; on success the extracted text goes to stdout.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("Usage: convert_docx.py <path-to-file.docx>", file=sys.stderr)
        sys.exit(1)

    (filepath,) = cli_args

    # Existence is checked before the extension, so a missing path is always
    # reported as "not found" regardless of its suffix.
    problem = None
    if not os.path.exists(filepath):
        problem = f"Error: File not found: {filepath}"
    elif not filepath.lower().endswith(".docx"):
        problem = f"Error: Not a .docx file: {filepath}"

    if problem is not None:
        print(problem, file=sys.stderr)
        sys.exit(1)

    try:
        print(convert_docx(filepath))
    except Exception as e:
        print(f"Error converting file: {e}", file=sys.stderr)
        sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||||
122
skills/tabular-extract/scripts/test_convert_docx.py
Normal file
122
skills/tabular-extract/scripts/test_convert_docx.py
Normal file
@@ -0,0 +1,122 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Tests for convert_docx.py"""
|
||||||
|
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import tempfile
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Path to the script under test, located in the same directory as this file.
SCRIPT = os.path.join(os.path.dirname(__file__), "convert_docx.py")
|
||||||
|
|
||||||
|
|
||||||
|
def test_missing_argument():
    """With no arguments the script prints its usage line and exits 1."""
    command = [sys.executable, SCRIPT]
    completed = subprocess.run(command, capture_output=True, text=True)

    assert completed.returncode == 1
    assert "Usage:" in completed.stderr
|
||||||
|
|
||||||
|
|
||||||
|
def test_nonexistent_file():
    """Script should report "File not found" and exit 1 for a missing path."""
    # A name inside a fresh temporary directory is guaranteed not to exist,
    # unlike a hard-coded path under /tmp.
    with tempfile.TemporaryDirectory() as tmpdir:
        missing = os.path.join(tmpdir, "nonexistent_file_abc123.docx")
        result = subprocess.run(
            [sys.executable, SCRIPT, missing],
            capture_output=True, text=True
        )

    assert result.returncode == 1
    # Pin the specific message so an unrelated failure cannot satisfy the test.
    assert "File not found" in result.stderr
|
||||||
|
|
||||||
|
|
||||||
|
def test_non_docx_file():
    """Script should reject an existing file that lacks the .docx extension."""
    # The temporary directory removes the file even if running the script
    # raises, and the file is closed before the subprocess starts.
    with tempfile.TemporaryDirectory() as tmpdir:
        path = os.path.join(tmpdir, "not_a_document.txt")
        with open(path, "wb") as f:
            f.write(b"hello world")

        result = subprocess.run(
            [sys.executable, SCRIPT, path],
            capture_output=True, text=True
        )

    assert result.returncode == 1
    # Confirm the failure is the extension check, not some other error.
    assert "Not a .docx file" in result.stderr
|
||||||
|
|
||||||
|
|
||||||
|
def test_valid_docx():
    """Script should extract paragraph text, in order, from a valid DOCX file."""
    try:
        from docx import Document
    except ImportError:
        print("SKIP: python-docx not installed")
        return

    doc = Document()
    doc.add_paragraph("Hello from test document")
    doc.add_paragraph("Second paragraph here")

    # Save to a path inside a temporary directory rather than onto an open
    # NamedTemporaryFile handle: the handle blocks the write on Windows, and
    # the directory is cleaned up even if the subprocess call raises.
    with tempfile.TemporaryDirectory() as tmpdir:
        path = os.path.join(tmpdir, "sample.docx")
        doc.save(path)
        result = subprocess.run(
            [sys.executable, SCRIPT, path],
            capture_output=True, text=True
        )

    assert result.returncode == 0
    assert "Hello from test document" in result.stdout
    assert "Second paragraph here" in result.stdout
    # Paragraphs must come out in document order.
    assert (
        result.stdout.index("Hello from test document")
        < result.stdout.index("Second paragraph here")
    )
|
||||||
|
|
||||||
|
|
||||||
|
def test_docx_with_table():
    """Script should extract table rows, interleaved in order with paragraphs."""
    try:
        from docx import Document
    except ImportError:
        print("SKIP: python-docx not installed")
        return

    doc = Document()
    doc.add_paragraph("Before table")
    table = doc.add_table(rows=2, cols=2)
    table.cell(0, 0).text = "Header1"
    table.cell(0, 1).text = "Header2"
    table.cell(1, 0).text = "Value1"
    table.cell(1, 1).text = "Value2"
    doc.add_paragraph("After table")

    # Save to a path inside a temporary directory rather than onto an open
    # NamedTemporaryFile handle: the handle blocks the write on Windows, and
    # the directory is cleaned up even if the subprocess call raises.
    with tempfile.TemporaryDirectory() as tmpdir:
        path = os.path.join(tmpdir, "table.docx")
        doc.save(path)
        result = subprocess.run(
            [sys.executable, SCRIPT, path],
            capture_output=True, text=True
        )

    assert result.returncode == 0
    out = result.stdout
    # Each row is emitted as one line with cells joined by " | ".
    assert "Header1 | Header2" in out
    assert "Value1 | Value2" in out
    # The table must appear between the surrounding paragraphs.
    assert (
        out.index("Before table")
        < out.index("Header1 | Header2")
        < out.index("Value1 | Value2")
        < out.index("After table")
    )
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Minimal runner so the tests can be executed directly as a script.
    # Prints one line per test, then a summary; exits 1 if anything failed.
    tally = {"passed": 0, "failed": 0}
    cases = (
        test_missing_argument,
        test_nonexistent_file,
        test_non_docx_file,
        test_valid_docx,
        test_docx_with_table,
    )

    for case in cases:
        try:
            case()
            print(f" PASS: {case.__name__}")
            tally["passed"] += 1
        except Exception as exc:
            # Assertion failures are reported as FAIL, anything else as ERROR;
            # both count as failures.
            if isinstance(exc, AssertionError):
                print(f" FAIL: {case.__name__} - {exc}")
            else:
                print(f" ERROR: {case.__name__} - {exc}")
            tally["failed"] += 1

    passed_count = tally["passed"]
    failed_count = tally["failed"]
    print(f"\n{passed_count} passed, {failed_count} failed")
    sys.exit(1 if failed_count else 0)
|
||||||
Reference in New Issue
Block a user