Files
tabular-extract/scripts/convert_docx.py
Bill Ballou be5b36fbc4 Initial commit: tabular-extract skill
Claude Code skill that extracts structured data from document
collections into tabular format using Claude's native document
understanding capabilities.
2026-03-02 23:56:28 -05:00

84 lines
2.1 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Convert a DOCX file to plain text.
Extracts text from paragraphs and tables.
Outputs to stdout for piping into other tools.
Usage:
convert_docx.py <path-to-file.docx>
Requires:
pip install python-docx
"""
import sys
import os
def convert_docx(filepath: str) -> str:
    """Extract text from a DOCX file, including paragraphs and tables.

    Body-level paragraphs and tables are emitted in document order.
    Paragraph text is stripped and empty paragraphs are skipped. Each table
    row becomes one line made of the stripped cell texts joined by " | ",
    and every table is followed by one empty line.

    Args:
        filepath: Path to the .docx file to read.

    Returns:
        The extracted lines joined with newline characters.

    Raises:
        SystemExit: With status 1 (after printing an install hint to
            stderr) when python-docx cannot be imported.

    Any error raised by python-docx while opening or reading the file is
    propagated to the caller unchanged.
    """
    try:
        # Imported lazily so a missing dependency produces a friendly hint
        # instead of a traceback at module import time.
        from docx import Document
        from docx.oxml.ns import qn
        from docx.table import Table
        from docx.text.paragraph import Paragraph
    except ImportError:
        print(
            "Error: python-docx is not installed.\n"
            "Install it with: pip install python-docx",
            file=sys.stderr
        )
        sys.exit(1)

    doc = Document(filepath)
    paragraph_tag = qn("w:p")
    table_tag = qn("w:tbl")

    parts = []
    for element in doc.element.body:
        # Wrap each body child directly in its proxy class rather than
        # rescanning doc.paragraphs / doc.tables for an identity match on
        # every element; this keeps the walk linear in document size.
        # Comparing against the fully qualified tag also tolerates nodes
        # whose tag is not a string (e.g. XML comments).
        tag = element.tag
        if tag == paragraph_tag:
            text = Paragraph(element, doc).text.strip()
            if text:
                parts.append(text)
        elif tag == table_tag:
            for row in Table(element, doc).rows:
                parts.append(" | ".join(cell.text.strip() for cell in row.cells))
            parts.append("")  # blank line after table
    return "\n".join(parts)
def main():
    """Command-line entry point: validate the argument, convert, and print.

    Expects exactly one command-line argument naming an existing file whose
    name ends in ".docx" (case-insensitive). The extracted text is written
    to stdout. Every failure prints a message to stderr and exits with
    status 1.
    """

    def abort(message):
        # Single exit path for all error cases: report on stderr, status 1.
        print(message, file=sys.stderr)
        sys.exit(1)

    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        abort("Usage: convert_docx.py <path-to-file.docx>")

    (filepath,) = cli_args
    if not os.path.exists(filepath):
        abort(f"Error: File not found: {filepath}")
    if not filepath.lower().endswith(".docx"):
        abort(f"Error: Not a .docx file: {filepath}")

    try:
        # Printing stays inside the try so output errors are reported the
        # same way as conversion errors.
        print(convert_docx(filepath))
    except Exception as exc:
        abort(f"Error converting file: {exc}")
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()