commit be5b36fbc458473a0c62c42af2d9ff8907bead5d Author: Bill Ballou Date: Mon Mar 2 23:56:28 2026 -0500 Initial commit: tabular-extract skill Claude Code skill that extracts structured data from document collections into tabular format using Claude's native document understanding capabilities. diff --git a/SKILL.md b/SKILL.md new file mode 100644 index 0000000..f5924ad --- /dev/null +++ b/SKILL.md @@ -0,0 +1,97 @@ +--- +name: tabular-extract +description: Extract structured data from document collections into tabular format. Reads PDFs, DOCX, TXT, MD, and JSON files from local paths, infers extraction columns from natural language descriptions, and outputs a markdown table plus a JSON file with values, confidence scores, supporting quotes, and reasoning. Use when the user asks to extract structured data from documents, turn documents into a spreadsheet or table, review or compare multiple documents side by side, or pull specific fields from a set of files. +--- + +# Tabular Extract + +Extract structured data from document collections into tabular format. + +## Pipeline + +This is a rigid, sequential pipeline. Execute every step in order. + +1. **Discover documents** — find files at the user's path +2. **Read documents** — convert each file to text +3. **Define schema** — infer extraction columns from user's description +4. **Extract data** — read each document and extract each column's value +5. **Output results** — display markdown table and save JSON file + +## Step 1: Discover Documents + +Glob the user-provided path for supported file types: + +```bash +**/*.pdf **/*.docx **/*.txt **/*.md **/*.json +``` + +Display the file list and count. Ask the user to confirm before proceeding. +If no supported files are found, tell the user and stop. 
+ +## Step 2: Read Documents + +Convert each file to text based on its type: + +| Format | Method | +|--------|--------| +| .pdf | Use the Read tool with `pages` parameter for large files (>10 pages: read in chunks of 20 pages) | +| .docx | Run: `python3 ~/.claude/skills/tabular-extract/scripts/convert_docx.py <path/to/file.docx>` (requires `pip install python-docx`) | +| .txt, .md | Use the Read tool directly | +| .json | Use the Read tool directly | + +If a file fails to convert, log it as skipped and continue with remaining files. Do not stop the pipeline. + +## Step 3: Define Extraction Schema + +The user describes what to extract in natural language. + +Infer a structured schema — for each column determine: +- **name**: Short, descriptive column header +- **type**: One of `text`, `number`, `date`, `boolean`, `list` +- **prompt**: Specific extraction instruction + +Present the inferred schema as a table and ask the user to confirm or adjust. + +Example: +``` +| # | Column | Type | Extraction Prompt | +|---|--------|------|-------------------| +| 1 | Party Name | text | Identify the full legal name of each party to the agreement | +| 2 | Effective Date | date | What is the effective date of this agreement? | +| 3 | Contract Value | number | What is the total contract value or consideration amount? | +``` + +## Step 4: Extract Data + +For each document, read its text and extract every column value. + +For each cell, produce: +- **value** — the extracted data (typed per column type) +- **confidence** — high, medium, or low +- **supporting_quote** — exact text from the document +- **reasoning** — why this value was chosen + +See `references/extraction-guide.md` for detailed type handling, confidence criteria, and null value handling. 
+ +## Step 5: Output Results + +**Display a markdown table** in the conversation: +- One row per document, one column per extraction field +- Append `(?)` to low-confidence values +- Truncate values longer than 60 characters with `...` + +**Save a JSON file** to `./extraction-results-YYYY-MM-DD.json` in the current working directory. +- Use the schema documented in `references/extraction-guide.md` +- Include metadata: timestamp, source path, document count, skipped files + +**Print a summary:** +- Documents processed / skipped +- Confidence distribution (how many high / medium / low extractions) + +## Error Handling + +- **Missing python-docx**: Print install command `pip install python-docx` and ask user to install +- **Unreadable file**: Skip file, record in skipped list, continue pipeline +- **Large PDF (>10 pages)**: Read in 20-page chunks, concatenate text +- **No files found**: Inform user and stop +- **User cancels at confirmation**: Stop gracefully diff --git a/references/extraction-guide.md b/references/extraction-guide.md new file mode 100644 index 0000000..776a9be --- /dev/null +++ b/references/extraction-guide.md @@ -0,0 +1,94 @@ +# Extraction Guide + +## Extraction Prompt Template + +For each document x column, use this reasoning structure: + +1. Read the document text carefully +2. Locate text relevant to the extraction prompt +3. Extract the value, noting its exact location +4. 
Assess confidence based on clarity of the source text + +## Per-Cell Output Structure + +For each extraction, produce a JSON object: + +```json +{ + "value": "<extracted value, typed per column type>", + "confidence": "high | medium | low", + "supporting_quote": "<exact text from the document>", + "reasoning": "<1-2 sentences explaining why this value was chosen>" +} +``` + +### Confidence Levels + +- **high**: Value is explicitly stated, unambiguous, directly answers the prompt +- **medium**: Value is implied or requires minor inference, or multiple possible values exist +- **low**: Value is uncertain, requires significant inference, or the document may not contain the answer + +### Type Handling + +| Column Type | Value Format | Example | +|-------------|-------------|---------| +| text | Plain string | "Acme Corporation" | +| number | Numeric value (no currency symbols) | 500000 | +| date | ISO 8601 format (YYYY-MM-DD) | "2024-01-15" | +| boolean | true or false | true | +| list | JSON array of strings | ["item1", "item2"] | + +### When a Value Cannot Be Found + +If the document does not contain information for a column: +- Set value to null +- Set confidence to "low" +- Set supporting_quote to "" +- Set reasoning to explain why the value could not be found + +## Full Output JSON Schema + +```json +{ + "extraction": { + "created": "ISO 8601 timestamp", + "source_directory": "/absolute/path/to/docs", + "documents_processed": 0, + "documents_skipped": [], + "columns": [ + { + "name": "Column Name", + "type": "text|number|date|boolean|list", + "prompt": "The extraction prompt used" + } + ], + "results": [ + { + "document": "filename.pdf", + "fields": { + "Column Name": { + "value": "extracted value", + "confidence": "high|medium|low", + "supporting_quote": "exact text from document", + "reasoning": "explanation" + } + } + } + ] + } +} +``` + +## Markdown Table Format + +Display results as a pipe-delimited markdown table. +Append `(?)` to low-confidence values. +Truncate cell values longer than 60 characters with `...`. 
#!/usr/bin/env python3
"""
Convert a DOCX file to plain text.

Extracts text from paragraphs and tables.
Outputs to stdout for piping into other tools.

Usage:
    convert_docx.py <file.docx>

Requires:
    pip install python-docx
"""

import sys
import os


def convert_docx(filepath: str) -> str:
    """Extract text from a DOCX file, including paragraphs and tables.

    Walks the direct children of the document body in document order so
    that paragraphs and tables come out interleaved exactly as they
    appear in the file.

    Args:
        filepath: Path to the ``.docx`` file to read.

    Returns:
        The extracted text joined with newlines. Paragraphs whose text is
        empty after stripping are skipped. Each table row is rendered as
        its stripped cell texts joined with " | ", and every table is
        followed by one blank line.

    Raises:
        SystemExit: With status 1 (after printing an install hint to
            stderr) when python-docx cannot be imported.
    """
    try:
        from docx import Document
        from docx.table import Table
        from docx.text.paragraph import Paragraph
    except ImportError:
        print(
            "Error: python-docx is not installed.\n"
            "Install it with: pip install python-docx",
            file=sys.stderr
        )
        sys.exit(1)

    doc = Document(filepath)
    parts = []

    for element in doc.element.body:
        # XML comments / processing instructions expose a non-string tag;
        # they carry no document text, so skip them instead of crashing.
        if not isinstance(element.tag, str):
            continue

        # Drop the "{namespace}" prefix, leaving the local tag name.
        tag = element.tag.rpartition("}")[2]

        if tag == "p":
            # Wrap the element directly rather than searching
            # doc.paragraphs, which is rebuilt on every access and made
            # the original loop quadratic in document size.
            text = Paragraph(element, doc).text.strip()
            if text:
                parts.append(text)

        elif tag == "tbl":
            for row in Table(element, doc).rows:
                parts.append(" | ".join(cell.text.strip() for cell in row.cells))
            parts.append("")  # blank line after table

    return "\n".join(parts)


def main():
    """Command-line entry point: print the text of one DOCX file.

    Expects exactly one argument, the path to a ``.docx`` file. The
    extracted text goes to stdout. Every failure (wrong argument count,
    missing file, wrong extension, conversion error) prints a message to
    stderr and exits with status 1.
    """
    if len(sys.argv) != 2:
        print("Usage: convert_docx.py <file.docx>", file=sys.stderr)
        sys.exit(1)

    filepath = sys.argv[1]

    if not os.path.exists(filepath):
        print(f"Error: File not found: {filepath}", file=sys.stderr)
        sys.exit(1)

    if not filepath.lower().endswith(".docx"):
        print(f"Error: Not a .docx file: {filepath}", file=sys.stderr)
        sys.exit(1)

    try:
        text = convert_docx(filepath)
        print(text)
    except Exception as e:
        # Top-level boundary: report any conversion failure as a clean
        # one-line error instead of a traceback.
        print(f"Error converting file: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
def test_docx_with_table():
    """Script should extract table content from a DOCX file.

    Builds a document with a paragraph, a 2x2 table and a trailing
    paragraph, runs the converter on it as a subprocess, and checks that
    the table rows are rendered pipe-joined and in document order.
    Prints a SKIP notice and returns early when python-docx is missing.
    """
    try:
        from docx import Document
    except ImportError:
        print("SKIP: python-docx not installed")
        return

    doc = Document()
    doc.add_paragraph("Before table")
    table = doc.add_table(rows=2, cols=2)
    table.cell(0, 0).text = "Header1"
    table.cell(0, 1).text = "Header2"
    table.cell(1, 0).text = "Value1"
    table.cell(1, 1).text = "Value2"
    doc.add_paragraph("After table")

    # A temporary directory avoids writing to (and unlinking) a file that
    # is still held open, which fails on Windows, and guarantees cleanup
    # even if the subprocess call raises.
    with tempfile.TemporaryDirectory() as tmpdir:
        docx_path = os.path.join(tmpdir, "with_table.docx")
        doc.save(docx_path)
        result = subprocess.run(
            [sys.executable, SCRIPT, docx_path],
            capture_output=True, text=True
        )

    assert result.returncode == 0, result.stderr
    output = result.stdout
    assert "Before table" in output
    assert "Header1 | Header2" in output
    assert "Value1 | Value2" in output
    assert "After table" in output
    # The table must appear between the two paragraphs, as in the document.
    assert output.index("Before table") < output.index("Header1") < output.index("After table")