''' ๐Ÿ“ data_collection_engine/collect_all_cds.py S&P500 ๋“ฑ ์—ฌ๋Ÿฌ ์ข…๋ชฉ์— ๋Œ€ํ•ด CDS ๋ฐ์ดํ„ฐ๋ฅผ ์ผ๊ด„ ์ˆ˜์ง‘ํ•˜๊ณ , XGBoost ํ•™์Šต์„ ์œ„ํ•œ ํ†ตํ•ฉ ๋ฐ์ดํ„ฐ์…‹๋„ ์ž๋™ ์ƒ์„ฑํ•˜๋Š” ์Šคํฌ๋ฆฝํŠธ ''' import os import time import pandas as pd from data_collection_engine.engine import DataCollectionEngine from data_analysis_engine.dataset_builder import build_dataset def collect_all_cds(ticker_list_path: str, start: str, end: str, save_dir: str = "data", delay_sec: int = 13): """ ์—ฌ๋Ÿฌ ์ข…๋ชฉ์˜ CDS ๋ฐ์ดํ„ฐ๋ฅผ ์ผ๊ด„ ์ˆ˜์ง‘ํ•˜์—ฌ ์ €์žฅํ•˜๊ณ , ํ•™์Šต์šฉ ํ†ตํ•ฉ ๋ฐ์ดํ„ฐ์…‹๋„ ์ƒ์„ฑ Parameters: ticker_list_path (str): ์ˆ˜์ง‘ํ•  ์ข…๋ชฉ ์ฝ”๋“œ๊ฐ€ ๋“ค์–ด ์žˆ๋Š” CSV ํŒŒ์ผ ๊ฒฝ๋กœ start (str): ์‹œ์ž‘์ผ (YYYY-MM-DD) end (str): ์ข…๋ฃŒ์ผ (YYYY-MM-DD) save_dir (str): ์ €์žฅ ๋””๋ ‰ํ† ๋ฆฌ delay_sec (int): API ํ˜ธ์ถœ ๊ฐ„๊ฒฉ (๊ธฐ๋ณธ 13์ดˆ) """ df = pd.read_csv(ticker_list_path) # ์‚ฌ์šฉ์ž์—๊ฒŒ ํƒ€์ž… ์„ ํƒ ์˜ต์…˜ ์ œ๊ณต print("\n๐Ÿ” ์„ ํƒ ๊ฐ€๋Šฅํ•œ ์ข…๋ชฉ ์œ ํ˜•:") type_mapping = { "CS": "Common Stock - ์ผ๋ฐ˜ ์ฃผ์‹ (๊ฐ€์žฅ ๋ณดํŽธ์ ์ธ ๊ธฐ์—… ์ง€๋ถ„)", "ETF": "Exchange Traded Fund - ์ง€์ˆ˜ ์ถ”์ข…ํ˜• ์ƒ์žฅ์ง€์ˆ˜ํŽ€๋“œ", "ETN": "Exchange Traded Note - ์ฑ„๊ถŒ ๊ธฐ๋ฐ˜์˜ ์ƒ์žฅ์ง€์ˆ˜์ฆ๊ถŒ", "ADR": "American Depository Receipt - ํ•ด์™ธ ์ฃผ์‹์˜ ๋ฏธ๊ตญ ์ƒ์žฅ ๋ฒ„์ „", "PREF": "Preferred Stock - ๋ฐฐ๋‹น ์šฐ์„ ์ฃผ", "UNIT": "Unit - ETF ๊ตฌ์„ฑ ๋‹จ์œ„ ๋˜๋Š” ๋ฌถ์Œ", "RIGHT": "Rights - ์‹ ์ฃผ์ธ์ˆ˜๊ถŒ, ์ผ์ • ๊ธฐ๊ฐ„ ๋‚ด ์ฃผ์‹ ๋งค์ž… ๊ถŒ๋ฆฌ", "FUND": "Mutual Fund - ๊ณต๋ชจํŽ€๋“œ ๋˜๋Š” ํˆฌ์ž์‹ ํƒ", "SP": "Structured Product - ํŒŒ์ƒ๊ฒฐํ•ฉ์ฆ๊ถŒ", "WARRANT": "Warrant - ์ผ์ • ๊ฐ€๊ฒฉ์— ์ฃผ์‹ ๊ตฌ๋งค ๊ถŒํ•œ์„ ๋ถ€์—ฌํ•˜๋Š” ์ฆ์„œ", "INDEX": "Market Index - ์ข…ํ•ฉ ์ฃผ๊ฐ€์ง€์ˆ˜ ๋“ฑ", "BOND": "Bond - ํšŒ์‚ฌ์ฑ„ ๋˜๋Š” ์ •๋ถ€์ฑ„ ๋“ฑ ๊ณ ์ •์ˆ˜์ต ์ƒํ’ˆ" } for k, v in type_mapping.items(): print(f"{k:<7}: {v}") type_input = input("\nโœ… ์ˆ˜์ง‘ํ•  ์ข…๋ชฉ ์œ ํ˜•์„ ์ž…๋ ฅํ•˜์„ธ์š” (์˜ˆ: CS): ").strip().upper() if "type" not in df.columns: print("โ— 'type' ์ปฌ๋Ÿผ์ด ์กด์žฌํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค. ์ „์ฒด ๋ฐ์ดํ„ฐ ์‚ฌ์šฉ.") filtered_df = df.copy() else: filtered_df = df[df["type"] == type_input].copy() total_available = len(filtered_df) print(f"\n์ด {total_available}๊ฐœ์˜ ์ข…๋ชฉ์ด ์„ ํƒ๋œ ์œ ํ˜•({type_input})์— ํ•ด๋‹นํ•ฉ๋‹ˆ๋‹ค.") estimated_time = total_available * delay_sec / 60 print(f"โณ ์˜ˆ์ƒ ์†Œ์š” ์‹œ๊ฐ„: ์•ฝ {estimated_time:.1f}๋ถ„ (์š”๊ธˆ์ œ ๊ธฐ์ค€ {delay_sec}์ดˆ ๊ฐ„๊ฒฉ)") count_limit = input(f"๋ช‡ ๊ฐœ๋ฅผ ์ˆ˜์ง‘ํ•˜์‹œ๊ฒ ์Šต๋‹ˆ๊นŒ? [๊ธฐ๋ณธ: {total_available}]: ").strip() count = int(count_limit) if count_limit else total_available tickers = filtered_df.head(count) symbol_col = "symbol" if "symbol" in tickers.columns else "ticker" ticker_list = tickers[symbol_col].dropna().unique() # ์ข…๋ฃŒ ๋‚ ์งœ ๊ธฐ์ค€์œผ๋กœ ์ €์žฅ ๋””๋ ‰ํ† ๋ฆฌ ์„ธ๋ถ„ํ™” (์˜ˆ: data/CS/2024-12-31) dated_dir = os.path.join(save_dir, type_input, end) engine = DataCollectionEngine(data_dir=dated_dir) os.makedirs(dated_dir, exist_ok=True) failed = [] X_all, y_all = [], [] error_429_count = 0 for i, symbol in enumerate(ticker_list): try: print(f"[{i+1}/{len(ticker_list)}] {symbol} ์ˆ˜์ง‘ ์ค‘...") df = engine.collect(symbol.strip().upper(), start, end) X, y = build_dataset(df) X_all.append(X) y_all.append(y) error_429_count = 0 time.sleep(delay_sec) except Exception as e: print(f"โŒ {symbol} ์ˆ˜์ง‘ ์‹คํŒจ: {e}") failed.append(symbol) if "429" in str(e): error_429_count += 1 if error_429_count >= 2: print("โธ๏ธ ์—ฐ์†๋œ 429 ์˜ค๋ฅ˜ ๊ฐ์ง€ โ†’ 60์ดˆ ๋Œ€๊ธฐ ์ค‘...") time.sleep(60) if failed: print("\nโš ๏ธ ์ผ๋ถ€ ์ข…๋ชฉ ์ˆ˜์ง‘ ์‹คํŒจ:", failed) failed_df = pd.DataFrame(failed, columns=["ticker"]) failed_df.to_csv(os.path.join(dated_dir, "failed_tickers.csv"), index=False) print("๐Ÿ“„ ์‹คํŒจ ์ข…๋ชฉ ๋ชฉ๋ก ์ €์žฅ ์™„๋ฃŒ: failed_tickers.csv") else: print("\nโœ… ์ „์ฒด ์ข…๋ชฉ ์ˆ˜์ง‘ ์™„๋ฃŒ!") if X_all: X_total = pd.concat(X_all, ignore_index=True) y_total = pd.concat(y_all, ignore_index=True) X_total.to_csv(os.path.join(dated_dir, "X_total.csv"), index=False) y_total.to_csv(os.path.join(dated_dir, "y_total.csv"), index=False) print("โœ… ํ†ตํ•ฉ ํ•™์Šต์šฉ ๋ฐ์ดํ„ฐ ์ €์žฅ ์™„๋ฃŒ: X_total.csv, y_total.csv") else: print("โ— ์œ ํšจํ•œ CDS ๋ฐ์ดํ„ฐ๊ฐ€ ์—†์–ด ํ•™์Šต์šฉ ๋ฐ์ดํ„ฐ์…‹์„ ์ƒ์„ฑํ•˜์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค.") if __name__ == "__main__": print("CDS ์ผ๊ด„ ์ˆ˜์ง‘์„ ์‹œ์ž‘ํ•ฉ๋‹ˆ๋‹ค. ์•„๋ž˜ ์ •๋ณด๋ฅผ ์ž…๋ ฅํ•ด ์ฃผ์„ธ์š”:") ticker_list_path = input("ํ‹ฐ์ปค ๋ฆฌ์ŠคํŠธ CSV ๊ฒฝ๋กœ [๊ธฐ๋ณธ: sp500_tickers.csv]: ").strip() or "sp500_tickers.csv" start = input("์‹œ์ž‘์ผ (YYYY-MM-DD) [๊ธฐ๋ณธ: 2023-01-01]: ").strip() or "2023-01-01" end = input("์ข…๋ฃŒ์ผ (YYYY-MM-DD) [๊ธฐ๋ณธ: 2024-12-31]: ").strip() or "2024-12-31" save_dir = input("์ €์žฅ ๋””๋ ‰ํ† ๋ฆฌ ์ด๋ฆ„ [๊ธฐ๋ณธ: data]: ").strip() or "data" delay = input("API ํ˜ธ์ถœ ๊ฐ„๊ฒฉ (์ดˆ) [๊ธฐ๋ณธ: 13]: ").strip() delay_sec = int(delay) if delay else 13 collect_all_cds( ticker_list_path=ticker_list_path, start=start, end=end, save_dir=save_dir, delay_sec=delay_sec )