#!/usr/bin/env python3
"""
Convert a DOCX file to plain text.

Extracts text from paragraphs and tables, in document order.
Outputs to stdout for piping into other tools.

Usage: convert_docx.py FILE.docx

Requires: pip install python-docx
"""

import sys
import os


def convert_docx(filepath: str) -> str:
    """Extract text from a DOCX file, including paragraphs and tables.

    Top-level body elements are visited in document order. Each non-empty
    paragraph contributes one line (stripped). Each table row contributes
    one line with its stripped cell texts joined by " | ", and every table
    is followed by one empty line.

    Args:
        filepath: Path of the DOCX file to read.

    Returns:
        The extracted lines joined with newline characters.

    Exits the process with status 1 (after printing a hint to stderr) when
    python-docx cannot be imported. Errors raised by python-docx while
    opening or reading the file propagate to the caller.
    """
    # Imported lazily so a missing dependency yields a friendly message
    # instead of a traceback at module import time.
    try:
        from docx import Document
        from docx.table import Table
        from docx.text.paragraph import Paragraph
    except ImportError:
        print(
            "Error: python-docx is not installed.\n"
            "Install it with: pip install python-docx",
            file=sys.stderr
        )
        sys.exit(1)

    doc = Document(filepath)
    parts = []

    for element in doc.element.body:
        raw_tag = element.tag
        if not isinstance(raw_tag, str):
            # XML comments / processing instructions expose a non-string
            # tag; they carry no document text, so skip them.
            continue

        # Drop the "{namespace}" prefix, leaving the local element name.
        tag = raw_tag.rsplit("}", 1)[-1]

        if tag == "p":
            # Wrap the element directly rather than searching
            # doc.paragraphs for it on every iteration (which rebuilds the
            # whole paragraph list each time).
            text = Paragraph(element, doc).text.strip()
            if text:
                parts.append(text)
        elif tag == "tbl":
            table = Table(element, doc)
            for row in table.rows:
                parts.append(" | ".join(cell.text.strip() for cell in row.cells))
            parts.append("")  # blank line after table

    return "\n".join(parts)


def main() -> None:
    """Command-line entry point: validate the argument, convert, print.

    Expects exactly one command-line argument, the path of an existing
    file whose name ends in ".docx" (case-insensitive). On any validation
    or conversion failure a message is written to stderr and the process
    exits with status 1; on success the extracted text goes to stdout.
    """
    if len(sys.argv) != 2:
        print("Usage: convert_docx.py FILE.docx", file=sys.stderr)
        sys.exit(1)

    filepath = sys.argv[1]

    if not os.path.exists(filepath):
        print(f"Error: File not found: {filepath}", file=sys.stderr)
        sys.exit(1)

    if not filepath.lower().endswith(".docx"):
        print(f"Error: Not a .docx file: {filepath}", file=sys.stderr)
        sys.exit(1)

    # Top-level boundary: report any conversion failure as a single line
    # on stderr rather than a traceback.
    try:
        text = convert_docx(filepath)
        print(text)
    except Exception as e:
        print(f"Error converting file: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()