Files
AI-Trader/data/merge_jsonl.py
Bill 11509ba8c7 fix: merge script now writes to current directory for volume compatibility
Changed merge_jsonl.py to use os.getcwd() instead of os.path.dirname(__file__)
to ensure merged.jsonl is written to the working directory where data files exist,
not to the script's installation directory.

Root cause:
- Dockerfile copies scripts to /app/scripts/ for volume compatibility
- entrypoint.sh runs: cd /app/data && python /app/scripts/merge_jsonl.py
- Old logic used script directory (/app/scripts/), ignoring working directory
- This caused merged.jsonl to be created in /app/scripts/ instead of /app/data/
- Since /app/data/ is volume-mounted, merged file was not visible to host

Solution:
- Scripts now respect the current working directory (Unix philosophy)
- Works correctly with volume mounts and script relocation
- Tested in both local and Docker directory structure scenarios

Fixes the issue where merged.jsonl was missing from mounted data volume.
2025-10-31 00:03:39 -04:00

69 lines
3.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import json
import os
import glob
# Nasdaq-100 constituent tickers (includes both Alphabet share classes,
# GOOG and GOOGL).  Used below to filter daily_price*.json files by
# substring match against the file name.
# NOTE(review): snapshot of index membership at commit time — the index
# composition changes over time; confirm before relying on completeness.
all_nasdaq_100_symbols = [
"NVDA", "MSFT", "AAPL", "GOOG", "GOOGL", "AMZN", "META", "AVGO", "TSLA",
"NFLX", "PLTR", "COST", "ASML", "AMD", "CSCO", "AZN", "TMUS", "MU", "LIN",
"PEP", "SHOP", "APP", "INTU", "AMAT", "LRCX", "PDD", "QCOM", "ARM", "INTC",
"BKNG", "AMGN", "TXN", "ISRG", "GILD", "KLAC", "PANW", "ADBE", "HON",
"CRWD", "CEG", "ADI", "ADP", "DASH", "CMCSA", "VRTX", "MELI", "SBUX",
"CDNS", "ORLY", "SNPS", "MSTR", "MDLZ", "ABNB", "MRVL", "CTAS", "TRI",
"MAR", "MNST", "CSX", "ADSK", "PYPL", "FTNT", "AEP", "WDAY", "REGN", "ROP",
"NXPI", "DDOG", "AXON", "ROST", "IDXX", "EA", "PCAR", "FAST", "EXC", "TTWO",
"XEL", "ZS", "PAYX", "WBD", "BKR", "CPRT", "CCEP", "FANG", "TEAM", "CHTR",
"KDP", "MCHP", "GEHC", "VRSK", "CTSH", "CSGP", "KHC", "ODFL", "DXCM", "TTD",
"ON", "BIIB", "LULU", "CDW", "GFS"
]
# Merge every JSON file in the current working directory whose name starts
# with "daily_price" into merged.jsonl, one JSON document per line.
#
# The current working directory (not the script's own directory) is used on
# purpose: in the Docker setup the script lives in /app/scripts/ but is run
# with cwd=/app/data/, the volume-mounted directory where both the input
# files and the merged output must live.


def main(symbols=None):
    """Merge daily_price*.json files from the CWD into merged.jsonl.

    For every matching file whose basename contains one of the tracked
    symbols:
      * rename "1. open" -> "1. buy price" and "4. close" -> "4. sell price"
        for every date in the "Time Series ..." section,
      * for the latest date keep only "1. buy price",
      * update the "Meta Data" description to match the renamed keys,
    then write the (possibly transformed) document as one JSONL line.

    Args:
        symbols: iterable of ticker symbols used to filter filenames by
            substring match; defaults to the module-level Nasdaq-100 list.
    """
    if symbols is None:
        symbols = all_nasdaq_100_symbols
    work_dir = os.getcwd()
    files = sorted(glob.glob(os.path.join(work_dir, 'daily_price*.json')))
    output_file = os.path.join(work_dir, 'merged.jsonl')
    with open(output_file, 'w', encoding='utf-8') as fout:
        for fp in files:
            basename = os.path.basename(fp)
            # Only merge files whose name contains a tracked symbol.
            # NOTE(review): this is a plain substring match, so a short symbol
            # such as "ON" could also match unrelated filenames — confirm the
            # files are named daily_price_<SYMBOL>.json if this ever misfires.
            if not any(symbol in basename for symbol in symbols):
                continue
            with open(fp, 'r', encoding='utf-8') as f:
                data = json.load(f)
            try:
                # Find the first key starting with "Time Series"
                # (e.g. "Time Series (Daily)").
                series = None
                for key, value in data.items():
                    if key.startswith("Time Series"):
                        series = value
                        break
                if isinstance(series, dict) and series:
                    # First rename the open/close keys for every date.
                    # (Only bar contents are mutated, never the series keys,
                    # so iterating the live view is safe.)
                    for bar in series.values():
                        if not isinstance(bar, dict):
                            continue
                        if "1. open" in bar:
                            bar["1. buy price"] = bar.pop("1. open")
                        if "4. close" in bar:
                            bar["4. sell price"] = bar.pop("4. close")
                    # Then strip the latest date down to the buy price only.
                    # ISO date strings sort chronologically, so max() picks
                    # the most recent day.
                    latest_date = max(series.keys())
                    latest_bar = series.get(latest_date, {})
                    if isinstance(latest_bar, dict):
                        buy_val = latest_bar.get("1. buy price")
                        series[latest_date] = {"1. buy price": buy_val} if buy_val is not None else {}
                    # Keep the Meta Data description in sync with the renames.
                    meta = data.get("Meta Data", {})
                    if isinstance(meta, dict):
                        meta["1. Information"] = "Daily Prices (buy price, high, low, sell price) and Volumes"
            except Exception:
                # Deliberate best-effort: documents with an unexpected shape
                # are written through unchanged instead of aborting the merge.
                pass
            fout.write(json.dumps(data, ensure_ascii=False) + "\n")


if __name__ == "__main__":
    main()