tabular-extract/scripts/convert_docx.py

#!/usr/bin/env python3
"""
Convert a DOCX file to plain text.

Extracts text from paragraphs and tables.
Outputs to stdout for piping into other tools.

Usage:
    convert_docx.py <path-to-file.docx>

Requires:
    pip install python-docx
"""

import sys
import os


def convert_docx(filepath: str) -> str:
    """Extract text from a DOCX file, including paragraphs and tables.

    Top-level paragraphs and tables are emitted in document order.
    Paragraph text is stripped and paragraphs that are empty after stripping
    are skipped. Each table row becomes one line of stripped cell texts
    joined by " | ", and every table is followed by one empty line.

    Args:
        filepath: Path to the .docx file to read.

    Returns:
        The extracted text, with all parts joined by newlines.

    Note:
        If python-docx cannot be imported, an install hint is printed to
        stderr and the process exits with status 1. Errors raised while
        opening or reading the document propagate to the caller.
    """
    try:
        from docx import Document
    except ImportError:
        print(
            "Error: python-docx is not installed.\n"
            "Install it with: pip install python-docx",
            file=sys.stderr
        )
        sys.exit(1)

    doc = Document(filepath)

    # Index the wrapper objects once, keyed by the identity of their XML
    # element, so the loop below does an O(1) lookup per body child instead
    # of rescanning doc.paragraphs / doc.tables every time (quadratic on
    # large documents). The wrappers stored as values keep their elements
    # alive, so the id() keys stay valid for the lifetime of these dicts.
    paragraph_for = {id(para._element): para for para in doc.paragraphs}
    table_for = {id(table._element): table for table in doc.tables}

    parts = []
    for element in doc.element.body:
        raw_tag = element.tag
        if not isinstance(raw_tag, str):
            # Nodes such as XML comments do not carry a string tag; they
            # hold no document text, so skip them instead of failing.
            continue

        # Drop the "{namespace}" prefix, if any, leaving the local name.
        tag = raw_tag.rpartition("}")[2]

        if tag == "p":
            para = paragraph_for.get(id(element))
            if para is None:
                continue
            text = para.text.strip()
            if text:
                parts.append(text)

        elif tag == "tbl":
            table = table_for.get(id(element))
            if table is None:
                continue
            parts.extend(
                " | ".join(cell.text.strip() for cell in row.cells)
                for row in table.rows
            )
            parts.append("")  # blank line after table

    return "\n".join(parts)


def main():
    """Command-line entry point.

    Expects exactly one argument, the path to a .docx file. The path must
    exist and end in ".docx" (case-insensitive). On success the converted
    text is printed to stdout; on any failure a message is written to
    stderr and the process exits with status 1.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("Usage: convert_docx.py <path-to-file.docx>", file=sys.stderr)
        sys.exit(1)

    (filepath,) = cli_args

    # Validate the path up front; the first failing check decides the message.
    problem = None
    if not os.path.exists(filepath):
        problem = f"Error: File not found: {filepath}"
    elif not filepath.lower().endswith(".docx"):
        problem = f"Error: Not a .docx file: {filepath}"

    if problem is not None:
        print(problem, file=sys.stderr)
        sys.exit(1)

    try:
        print(convert_docx(filepath))
    except Exception as e:
        # Top-level boundary: report any conversion/output failure and exit.
        print(f"Error converting file: {e}", file=sys.stderr)
        sys.exit(1)


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()