Claude Code skill that extracts structured data from document collections into tabular format using Claude's native document understanding capabilities.
84 lines
2.1 KiB
Python
Executable File
84 lines
2.1 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Convert a DOCX file to plain text.

Extracts text from paragraphs and tables.
Outputs to stdout for piping into other tools.

Usage:
    convert_docx.py <path-to-file.docx>

Requires:
    pip install python-docx
|
|
"""
|
|
|
|
import sys
|
|
import os
|
|
|
|
|
|
def convert_docx(filepath: str) -> str:
    """Extract text from a DOCX file, including paragraphs and tables.

    The direct children of the document body are visited in order, so
    paragraphs and tables appear in the output in document order.

    Args:
        filepath: Path to the .docx file to read.

    Returns:
        A newline-joined string. Each non-empty paragraph contributes its
        stripped text as one line. Each table row contributes its stripped
        cell texts joined by " | ", and every table is followed by a blank
        line.

    Note:
        If python-docx cannot be imported, an error is printed to stderr and
        the process exits with status 1.
    """
    try:
        from docx import Document
        from docx.oxml.ns import qn
        from docx.table import Table
        from docx.text.paragraph import Paragraph
    except ImportError:
        print(
            "Error: python-docx is not installed.\n"
            "Install it with: pip install python-docx",
            file=sys.stderr
        )
        sys.exit(1)

    doc = Document(filepath)
    paragraph_tag = qn("w:p")
    table_tag = qn("w:tbl")
    parts = []

    # Wrap each body element directly instead of searching doc.paragraphs /
    # doc.tables for it: those properties rebuild their full lists on every
    # access, which made the original walk quadratic in document size.
    for element in doc.element.body:
        if element.tag == paragraph_tag:
            text = Paragraph(element, doc).text.strip()
            if text:
                parts.append(text)
        elif element.tag == table_tag:
            table = Table(element, doc)
            for row in table.rows:
                cells = [cell.text.strip() for cell in row.cells]
                parts.append(" | ".join(cells))
            parts.append("")  # blank line after table

    return "\n".join(parts)
|
|
|
|
|
|
def main():
    """Command-line entry point.

    Expects exactly one argument, the path to a .docx file, and prints the
    extracted text to stdout. On wrong usage, a missing path, a path that
    does not end in ".docx" (case-insensitive), or any exception raised
    during conversion, a message is written to stderr and the process exits
    with status 1.
    """

    def bail(message):
        # Single exit path for every failure: report on stderr, status 1.
        print(message, file=sys.stderr)
        sys.exit(1)

    if len(sys.argv) != 2:
        bail("Usage: convert_docx.py <path-to-file.docx>")

    filepath = sys.argv[1]

    if not os.path.exists(filepath):
        bail(f"Error: File not found: {filepath}")

    if not filepath.lower().endswith(".docx"):
        bail(f"Error: Not a .docx file: {filepath}")

    try:
        print(convert_docx(filepath))
    except Exception as e:
        bail(f"Error converting file: {e}")
|
|
|
|
|
|
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
|