""" πŸ“ data_analysis_engine/train_model.py Complete Data Set(CDS)λ₯Ό 기반으둜 XGBoost λͺ¨λΈμ„ ν•™μŠ΅ ν›„ μ €μž₯ν•˜λŠ” 슀크립트 ν•˜μ΄νΌνŒŒλΌλ―Έν„° νŠœλ‹ (GridSearchCV) 포함 + ν•™μŠ΅ 둜그 μžλ™ μ €μž₯ + logloss 및 Precision@TopN 좜λ ₯ """ import os import pandas as pd from datetime import datetime from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, log_loss from sklearn.model_selection import GridSearchCV from xgboost import XGBClassifier from data_analysis_engine.dataset_builder import build_dataset_with_indicators def precision_at_top_n(y_true, y_score, top_n=50): top_indices = pd.Series(y_score).nlargest(top_n).index top_preds = pd.Series(y_true).iloc[top_indices] return top_preds.sum() / top_n def train_model(cds_dir: str, output_model_path: str = None): """ CDS λ””λ ‰ν† λ¦¬μ—μ„œ 데이터λ₯Ό 읽고 λͺ¨λΈ ν•™μŠ΅ + 졜적 νŒŒλΌλ―Έν„° 탐색 ν›„ μ €μž₯ """ X_total, y_total = [], [] for file in os.listdir(cds_dir): if not file.endswith("_ohlcv.csv"): continue df = pd.read_csv(os.path.join(cds_dir, file)) if df.empty: continue X, y = build_dataset_with_indicators(df) X_total.append(X) y_total.append(y) if not X_total: raise ValueError("CDS νŒŒμΌμ—μ„œ ν•™μŠ΅ κ°€λŠ₯ν•œ 데이터가 μ—†μŠ΅λ‹ˆλ‹€.") X_all = pd.concat(X_total, ignore_index=True) y_all = pd.concat(y_total, ignore_index=True) # βœ… ν”Όμ²˜ 이름 보쑴 확인 X_all.columns = X_all.columns.astype(str) print("πŸ“Ž ν•™μŠ΅ ν”Όμ²˜:", list(X_all.columns)) # 클래슀 λΉ„μœ¨ 확인 class_counts = y_all.value_counts().to_dict() print(f"🎯 타깃 뢄포: {class_counts}") weight = class_counts.get(0, 1) / class_counts.get(1, 1) # κΈ°λ³Έ λͺ¨λΈ + νŠœλ‹ λŒ€μƒ μ„€μ • base_model = XGBClassifier( use_label_encoder=False, eval_metric='logloss', scale_pos_weight=weight, verbosity=0 ) param_grid = { 'max_depth': [3, 5], 'learning_rate': [0.1, 0.01], 'n_estimators': [100, 300] } grid_search = GridSearchCV(base_model, param_grid, cv=3, scoring='roc_auc', n_jobs=-1, verbose=1) grid_search.fit(X_all, y_all) best_model = grid_search.best_estimator_ preds = best_model.predict(X_all) probas = best_model.predict_proba(X_all)[:, 1] acc = accuracy_score(y_all, preds) auc = roc_auc_score(y_all, probas) f1 = f1_score(y_all, preds) logloss_val = log_loss(y_all, probas) p_at_50 = precision_at_top_n(y_all, probas, top_n=50) print(f"πŸ“Š 정확도: {acc:.2%} | AUC: {auc:.3f} | F1: {f1:.3f} | LogLoss: {logloss_val:.4f} | P@50: {p_at_50:.3f} | μƒ˜ν”Œ 수: {len(X_all):,}") print(f"πŸ† 졜적 ν•˜μ΄νΌνŒŒλΌλ―Έν„°: {grid_search.best_params_}") # 둜그 μ €μž₯ date_tag = datetime.now().strftime("%Y-%m-%d") log_path = "data_analysis_engine/train_log.csv" log_exists = os.path.exists(log_path) log_df = pd.DataFrame([{ "date": date_tag, "accuracy": round(acc, 4), "auc": round(auc, 4), "f1_score": round(f1, 4), "logloss": round(logloss_val, 4), "precision@50": round(p_at_50, 4), "samples": len(X_all), "best_params": str(grid_search.best_params_), "cds_dir": cds_dir }]) log_df.to_csv(log_path, mode='a', index=False, header=not log_exists) # λͺ¨λΈ μ €μž₯ from xgboost import Booster model_dir = "data_analysis_engine/models" os.makedirs(model_dir, exist_ok=True) versioned_path = os.path.join(model_dir, f"model_{date_tag}.json") best_model.save_model(versioned_path) print(f"βœ… λͺ¨λΈ μ €μž₯ μ™„λ£Œ: {versioned_path}") print(f"πŸ“ ν•™μŠ΅ 둜그 μ €μž₯ μ™„λ£Œ: {log_path}") if __name__ == "__main__": cds_dir = input("CDS 데이터 디렉토리 [κΈ°λ³Έ: data]: ").strip() or "data" train_model(cds_dir)