From 41f7973a6f551136b1d530af7cc6c0895195c992 Mon Sep 17 00:00:00 2001 From: Bill Ballou Date: Mon, 2 Mar 2026 23:57:05 -0500 Subject: [PATCH] Convert tabular-extract skill to git submodule Move tabular-extract from directly tracked files to a submodule pointing at https://git.prettyhefty.com/Bill/tabular-extract.git --- .gitmodules | 3 + skills/tabular-extract | 1 + skills/tabular-extract/SKILL.md | 97 -------------- .../references/extraction-guide.md | 94 -------------- .../tabular-extract/scripts/convert_docx.py | 83 ------------ .../scripts/test_convert_docx.py | 122 ------------------ 6 files changed, 4 insertions(+), 396 deletions(-) create mode 160000 skills/tabular-extract delete mode 100644 skills/tabular-extract/SKILL.md delete mode 100644 skills/tabular-extract/references/extraction-guide.md delete mode 100755 skills/tabular-extract/scripts/convert_docx.py delete mode 100644 skills/tabular-extract/scripts/test_convert_docx.py diff --git a/.gitmodules b/.gitmodules index 0689904..2e61d82 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,6 @@ [submodule "skills/docker-service-architecture"] path = skills/docker-service-architecture url = https://git.prettyhefty.com/Bill/docker-service-architecture-skill.git +[submodule "skills/tabular-extract"] + path = skills/tabular-extract + url = https://git.prettyhefty.com/Bill/tabular-extract.git diff --git a/skills/tabular-extract b/skills/tabular-extract new file mode 160000 index 0000000..be5b36f --- /dev/null +++ b/skills/tabular-extract @@ -0,0 +1 @@ +Subproject commit be5b36fbc458473a0c62c42af2d9ff8907bead5d diff --git a/skills/tabular-extract/SKILL.md b/skills/tabular-extract/SKILL.md deleted file mode 100644 index f5924ad..0000000 --- a/skills/tabular-extract/SKILL.md +++ /dev/null @@ -1,97 +0,0 @@ ---- -name: tabular-extract -description: Extract structured data from document collections into tabular format. Reads PDFs, DOCX, TXT, MD, and JSON files from local paths, infers extraction columns from natural language descriptions, and outputs a markdown table plus a JSON file with values, confidence scores, supporting quotes, and reasoning. Use when the user asks to extract structured data from documents, turn documents into a spreadsheet or table, review or compare multiple documents side by side, or pull specific fields from a set of files. ---- - -# Tabular Extract - -Extract structured data from document collections into tabular format. - -## Pipeline - -This is a rigid, sequential pipeline. Execute every step in order. - -1. **Discover documents** — find files at the user's path -2. **Read documents** — convert each file to text -3. **Define schema** — infer extraction columns from user's description -4. **Extract data** — read each document and extract each column's value -5. **Output results** — display markdown table and save JSON file - -## Step 1: Discover Documents - -Glob the user-provided path for supported file types: - -```bash -**/*.pdf **/*.docx **/*.txt **/*.md **/*.json -``` - -Display the file list and count. Ask the user to confirm before proceeding. -If no supported files are found, tell the user and stop. - -## Step 2: Read Documents - -Convert each file to text based on its type: - -| Format | Method | -|--------|--------| -| .pdf | Use the Read tool with `pages` parameter for large files (>10 pages: read in chunks of 20 pages) | -| .docx | Run: `python3 ~/.claude/skills/tabular-extract/scripts/convert_docx.py ` (requires `pip install python-docx`) | -| .txt, .md | Use the Read tool directly | -| .json | Use the Read tool directly | - -If a file fails to convert, log it as skipped and continue with remaining files. Do not stop the pipeline. - -## Step 3: Define Extraction Schema - -The user describes what to extract in natural language. - -Infer a structured schema — for each column determine: -- **name**: Short, descriptive column header -- **type**: One of `text`, `number`, `date`, `boolean`, `list` -- **prompt**: Specific extraction instruction - -Present the inferred schema as a table and ask the user to confirm or adjust. - -Example: -``` -| # | Column | Type | Extraction Prompt | -|---|--------|------|-------------------| -| 1 | Party Name | text | Identify the full legal name of each party to the agreement | -| 2 | Effective Date | date | What is the effective date of this agreement? | -| 3 | Contract Value | number | What is the total contract value or consideration amount? | -``` - -## Step 4: Extract Data - -For each document, read its text and extract every column value. - -For each cell, produce: -- **value** — the extracted data (typed per column type) -- **confidence** — high, medium, or low -- **supporting_quote** — exact text from the document -- **reasoning** — why this value was chosen - -See `references/extraction-guide.md` for detailed type handling, confidence criteria, and null value handling. - -## Step 5: Output Results - -**Display a markdown table** in the conversation: -- One row per document, one column per extraction field -- Append `(?)` to low-confidence values -- Truncate values longer than 60 characters with `...` - -**Save a JSON file** to `./extraction-results-YYYY-MM-DD.json` in the current working directory. -- Use the schema documented in `references/extraction-guide.md` -- Include metadata: timestamp, source path, document count, skipped files - -**Print a summary:** -- Documents processed / skipped -- Confidence distribution (how many high / medium / low extractions) - -## Error Handling - -- **Missing python-docx**: Print install command `pip install python-docx` and ask user to install -- **Unreadable file**: Skip file, record in skipped list, continue pipeline -- **Large PDF (>10 pages)**: Read in 20-page chunks, concatenate text -- **No files found**: Inform user and stop -- **User cancels at confirmation**: Stop gracefully diff --git a/skills/tabular-extract/references/extraction-guide.md b/skills/tabular-extract/references/extraction-guide.md deleted file mode 100644 index 776a9be..0000000 --- a/skills/tabular-extract/references/extraction-guide.md +++ /dev/null @@ -1,94 +0,0 @@ -# Extraction Guide - -## Extraction Prompt Template - -For each document x column, use this reasoning structure: - -1. Read the document text carefully -2. Locate text relevant to the extraction prompt -3. Extract the value, noting its exact location -4. Assess confidence based on clarity of the source text - -## Per-Cell Output Structure - -For each extraction, produce a JSON object: - -```json -{ - "value": "", - "confidence": "high | medium | low", - "supporting_quote": "", - "reasoning": "<1-2 sentences explaining why this value was chosen>" -} -``` - -### Confidence Levels - -- **high**: Value is explicitly stated, unambiguous, directly answers the prompt -- **medium**: Value is implied or requires minor inference, or multiple possible values exist -- **low**: Value is uncertain, requires significant inference, or the document may not contain the answer - -### Type Handling - -| Column Type | Value Format | Example | -|-------------|-------------|---------| -| text | Plain string | "Acme Corporation" | -| number | Numeric value (no currency symbols) | 500000 | -| date | ISO 8601 format (YYYY-MM-DD) | "2024-01-15" | -| boolean | true or false | true | -| list | JSON array of strings | ["item1", "item2"] | - -### When a Value Cannot Be Found - -If the document does not contain information for a column: -- Set value to null -- Set confidence to "low" -- Set supporting_quote to "" -- Set reasoning to explain why the value could not be found - -## Full Output JSON Schema - -```json -{ - "extraction": { - "created": "ISO 8601 timestamp", - "source_directory": "/absolute/path/to/docs", - "documents_processed": 0, - "documents_skipped": [], - "columns": [ - { - "name": "Column Name", - "type": "text|number|date|boolean|list", - "prompt": "The extraction prompt used" - } - ], - "results": [ - { - "document": "filename.pdf", - "fields": { - "Column Name": { - "value": "extracted value", - "confidence": "high|medium|low", - "supporting_quote": "exact text from document", - "reasoning": "explanation" - } - } - } - ] - } -} -``` - -## Markdown Table Format - -Display results as a pipe-delimited markdown table. -Append `(?)` to low-confidence values. -Truncate cell values longer than 60 characters with `...`. - -Example: -``` -| Document | Party Name | Date | Amount | -|----------|-----------|------|--------| -| contract1.pdf | Acme Corp | 2024-01-15 | 500000 | -| contract2.pdf | Beta LLC(?) | 2024-03-22 | 1200000 | -``` diff --git a/skills/tabular-extract/scripts/convert_docx.py b/skills/tabular-extract/scripts/convert_docx.py deleted file mode 100755 index 50efd9e..0000000 --- a/skills/tabular-extract/scripts/convert_docx.py +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/env python3 -""" -Convert a DOCX file to plain text. - -Extracts text from paragraphs and tables. -Outputs to stdout for piping into other tools. - -Usage: - convert_docx.py - -Requires: - pip install python-docx -""" - -import sys -import os - - -def convert_docx(filepath: str) -> str: - """Extract text from a DOCX file, including paragraphs and tables.""" - try: - from docx import Document - except ImportError: - print( - "Error: python-docx is not installed.\n" - "Install it with: pip install python-docx", - file=sys.stderr - ) - sys.exit(1) - - doc = Document(filepath) - parts = [] - - for element in doc.element.body: - tag = element.tag.split("}")[-1] if "}" in element.tag else element.tag - - if tag == "p": - # Paragraph - for para in doc.paragraphs: - if para._element is element: - text = para.text.strip() - if text: - parts.append(text) - break - - elif tag == "tbl": - # Table - for table in doc.tables: - if table._element is element: - for row in table.rows: - cells = [cell.text.strip() for cell in row.cells] - parts.append(" | ".join(cells)) - parts.append("") # blank line after table - break - - return "\n".join(parts) - - -def main(): - if len(sys.argv) != 2: - print("Usage: convert_docx.py ", file=sys.stderr) - sys.exit(1) - - filepath = sys.argv[1] - - if not os.path.exists(filepath): - print(f"Error: File not found: {filepath}", file=sys.stderr) - sys.exit(1) - - if not filepath.lower().endswith(".docx"): - print(f"Error: Not a .docx file: {filepath}", file=sys.stderr) - sys.exit(1) - - try: - text = convert_docx(filepath) - print(text) - except Exception as e: - print(f"Error converting file: {e}", file=sys.stderr) - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/skills/tabular-extract/scripts/test_convert_docx.py b/skills/tabular-extract/scripts/test_convert_docx.py deleted file mode 100644 index 10d35af..0000000 --- a/skills/tabular-extract/scripts/test_convert_docx.py +++ /dev/null @@ -1,122 +0,0 @@ -#!/usr/bin/env python3 -"""Tests for convert_docx.py""" - -import subprocess -import sys -import tempfile -import os - -SCRIPT = os.path.join(os.path.dirname(__file__), "convert_docx.py") - - -def test_missing_argument(): - """Script should print usage and exit 1 when no args given.""" - result = subprocess.run( - [sys.executable, SCRIPT], - capture_output=True, text=True - ) - assert result.returncode == 1 - assert "Usage:" in result.stderr - - -def test_nonexistent_file(): - """Script should error on a file that doesn't exist.""" - result = subprocess.run( - [sys.executable, SCRIPT, "/tmp/nonexistent_file_abc123.docx"], - capture_output=True, text=True - ) - assert result.returncode == 1 - assert "Error" in result.stderr or "not found" in result.stderr.lower() - - -def test_non_docx_file(): - """Script should error on a non-DOCX file.""" - with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as f: - f.write(b"hello world") - f.flush() - result = subprocess.run( - [sys.executable, SCRIPT, f.name], - capture_output=True, text=True - ) - os.unlink(f.name) - assert result.returncode == 1 - - -def test_valid_docx(): - """Script should extract text from a valid DOCX file.""" - try: - from docx import Document - except ImportError: - print("SKIP: python-docx not installed") - return - - doc = Document() - doc.add_paragraph("Hello from test document") - doc.add_paragraph("Second paragraph here") - - with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as f: - doc.save(f.name) - result = subprocess.run( - [sys.executable, SCRIPT, f.name], - capture_output=True, text=True - ) - os.unlink(f.name) - - assert result.returncode == 0 - assert "Hello from test document" in result.stdout - assert "Second paragraph here" in result.stdout - - -def test_docx_with_table(): - """Script should extract table content from a DOCX file.""" - try: - from docx import Document - except ImportError: - print("SKIP: python-docx not installed") - return - - doc = Document() - doc.add_paragraph("Before table") - table = doc.add_table(rows=2, cols=2) - table.cell(0, 0).text = "Header1" - table.cell(0, 1).text = "Header2" - table.cell(1, 0).text = "Value1" - table.cell(1, 1).text = "Value2" - doc.add_paragraph("After table") - - with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as f: - doc.save(f.name) - result = subprocess.run( - [sys.executable, SCRIPT, f.name], - capture_output=True, text=True - ) - os.unlink(f.name) - - assert result.returncode == 0 - assert "Header1" in result.stdout - assert "Value1" in result.stdout - - -if __name__ == "__main__": - tests = [ - test_missing_argument, - test_nonexistent_file, - test_non_docx_file, - test_valid_docx, - test_docx_with_table, - ] - passed = 0 - failed = 0 - for test in tests: - try: - test() - print(f" PASS: {test.__name__}") - passed += 1 - except AssertionError as e: - print(f" FAIL: {test.__name__} - {e}") - failed += 1 - except Exception as e: - print(f" ERROR: {test.__name__} - {e}") - failed += 1 - print(f"\n{passed} passed, {failed} failed") - sys.exit(1 if failed else 0)