#!/usr/bin/env python3
"""
Convert a DOCX file to plain text.

Extracts text from paragraphs and tables.
Outputs to stdout for piping into other tools.

Usage:
    convert_docx.py <path-to-file.docx>

Requires:
    pip install python-docx
"""

import sys
|
|
import os
|
|
|
|
|
|
def convert_docx(filepath: str) -> str:
    """Extract text from a DOCX file, including paragraphs and tables.

    Top-level paragraphs and tables are emitted in document order.
    Paragraphs that are empty after stripping are skipped. Each table row
    becomes one line with its stripped cell texts joined by " | ", and an
    empty line is added after every table.

    Args:
        filepath: Path to the .docx file to read.

    Returns:
        The extracted pieces joined by newlines.

    Note:
        If python-docx cannot be imported, an install hint is printed to
        stderr and the process exits with status 1.
    """
    try:
        from docx import Document
    except ImportError:
        print(
            "Error: python-docx is not installed.\n"
            "Install it with: pip install python-docx",
            file=sys.stderr,
        )
        sys.exit(1)

    doc = Document(filepath)

    # Build the element -> wrapper lookups once. Re-reading doc.paragraphs
    # and doc.tables for every body element would make the walk quadratic
    # in the size of the document.
    paragraphs_by_element = {para._element: para for para in doc.paragraphs}
    tables_by_element = {table._element: table for table in doc.tables}

    parts = []
    for element in doc.element.body:
        tag = element.tag
        if not isinstance(tag, str):
            # Not a regular element (e.g. an XML comment); nothing to extract.
            continue
        # Tags are namespace-qualified ("{uri}name"); keep the local name.
        tag = tag.rsplit("}", 1)[-1]

        if tag == "p":
            para = paragraphs_by_element.get(element)
            if para is None:
                continue
            text = para.text.strip()
            if text:
                parts.append(text)
        elif tag == "tbl":
            table = tables_by_element.get(element)
            if table is None:
                continue
            for row in table.rows:
                parts.append(" | ".join(cell.text.strip() for cell in row.cells))
            parts.append("")  # blank line after table

    return "\n".join(parts)
|
|
|
|
|
|
def main():
    """Command-line entry point: convert one DOCX file and print its text.

    Expects exactly one command-line argument, the path to a .docx file.
    A wrong argument count, a missing file, a path that does not end in
    ".docx", or a failed conversion is reported on stderr and ends the
    process with exit status 1. On success the extracted text is written
    to stdout.
    """
    if len(sys.argv) != 2:
        print("Usage: convert_docx.py <path-to-file.docx>", file=sys.stderr)
        sys.exit(1)

    filepath = sys.argv[1]

    if not os.path.exists(filepath):
        print(f"Error: File not found: {filepath}", file=sys.stderr)
        sys.exit(1)

    if not filepath.lower().endswith(".docx"):
        print(f"Error: Not a .docx file: {filepath}", file=sys.stderr)
        sys.exit(1)

    # Only the conversion is guarded here, so that a failure while writing
    # the result is not misreported as a conversion error.
    try:
        text = convert_docx(filepath)
    except Exception as e:
        print(f"Error converting file: {e}", file=sys.stderr)
        sys.exit(1)

    try:
        print(text)
        # Flush now so a closed pipe surfaces here rather than at shutdown.
        sys.stdout.flush()
    except BrokenPipeError:
        # The reader (e.g. `head`) closed the pipe early. Point stdout at
        # devnull so the interpreter's final flush does not fail again.
        devnull = os.open(os.devnull, os.O_WRONLY)
        os.dup2(devnull, sys.stdout.fileno())
        sys.exit(1)
|
|
|
|
|
|
# Run the converter only when this file is executed as a script.
if __name__ == "__main__":
    main()
|