This repository has been archived on 2025-06-07. You can view files and clone it, but cannot push or open issues or pull requests.
2025-05-06 21:23:04 +09:00

114 lines
3.9 KiB
Python

"""
📁 data_analysis_engine/train_model.py
Complete Data Set(CDS)를 기반으로 XGBoost 모델을 학습 후 저장하는 스크립트
하이퍼파라미터 튜닝 (GridSearchCV) 포함 + 학습 로그 자동 저장 + logloss 및 Precision@TopN 출력
"""
import os
import pandas as pd
from datetime import datetime
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, log_loss
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from data_analysis_engine.dataset_builder import build_dataset_with_indicators
def precision_at_top_n(y_true, y_score, top_n=50):
    """Return the share of positives among the ``top_n`` highest-scored samples.

    ``y_true`` and ``y_score`` are matched by position (first label belongs to
    the first score, and so on), regardless of any pandas index they carry.

    Args:
        y_true: Sequence of 0/1 labels.
        y_score: Sequence of scores, same length and order as ``y_true``.
        top_n: Number of highest-scored samples to inspect. It is also the
            denominator, even when fewer than ``top_n`` samples are available.

    Returns:
        Sum of the labels of the selected samples divided by ``top_n``.
    """
    # Drop any incoming index so that the labels produced by ``nlargest`` are
    # guaranteed to be positions; otherwise a Series with a non-default index
    # would hand index *labels* to the positional ``iloc`` lookup below.
    scores = pd.Series(y_score).reset_index(drop=True)
    labels = pd.Series(y_true).reset_index(drop=True)

    top_positions = scores.nlargest(top_n).index.to_numpy()
    return labels.iloc[top_positions].sum() / top_n
def train_model(cds_dir: str, output_model_path: str = None):
    """Train an XGBoost classifier on the CDS files in ``cds_dir`` and save it.

    Every ``*_ohlcv.csv`` file in ``cds_dir`` is read and passed through
    ``build_dataset_with_indicators``; the resulting feature/target pairs are
    concatenated. A small hyper-parameter grid is searched with 3-fold
    ``GridSearchCV`` scored by ROC AUC, the best estimator is evaluated, one
    row is appended to ``data_analysis_engine/train_log.csv`` and the model is
    written to a date-stamped JSON file under ``data_analysis_engine/models``.

    NOTE: the printed and logged metrics are computed on the same rows that
    were passed to ``fit``, i.e. they are in-sample figures.

    Args:
        cds_dir: Directory containing the ``*_ohlcv.csv`` files.
        output_model_path: Optional additional path for the trained model.
            The date-stamped copy is always written; when this argument is
            given, the model is saved there as well.

    Raises:
        ValueError: If no non-empty ``*_ohlcv.csv`` file is found in
            ``cds_dir``.
    """
    feature_frames, target_frames = [], []
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # concatenated row order independent of the filesystem.
    for file_name in sorted(os.listdir(cds_dir)):
        if not file_name.endswith("_ohlcv.csv"):
            continue
        df = pd.read_csv(os.path.join(cds_dir, file_name))
        if df.empty:
            continue
        X, y = build_dataset_with_indicators(df)
        feature_frames.append(X)
        target_frames.append(y)

    if not feature_frames:
        raise ValueError("CDS 파일에서 학습 가능한 데이터가 없습니다.")

    X_all = pd.concat(feature_frames, ignore_index=True)
    y_all = pd.concat(target_frames, ignore_index=True)

    # Force string column names so the feature names are kept as-is.
    X_all.columns = X_all.columns.astype(str)
    print("📎 학습 피처:", list(X_all.columns))

    # Class balance: ratio of label-0 count to label-1 count is used as
    # scale_pos_weight (a missing class falls back to a count of 1).
    class_counts = y_all.value_counts().to_dict()
    print(f"🎯 타깃 분포: {class_counts}")
    weight = class_counts.get(0, 1) / class_counts.get(1, 1)

    # Base model and the grid of hyper-parameters to search.
    base_model = XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        scale_pos_weight=weight,
        verbosity=0
    )
    param_grid = {
        'max_depth': [3, 5],
        'learning_rate': [0.1, 0.01],
        'n_estimators': [100, 300]
    }
    grid_search = GridSearchCV(
        base_model,
        param_grid,
        cv=3,
        scoring='roc_auc',
        n_jobs=-1,
        verbose=1,
    )
    grid_search.fit(X_all, y_all)
    best_model = grid_search.best_estimator_

    # Evaluate the selected estimator (on the training rows, see docstring).
    preds = best_model.predict(X_all)
    probas = best_model.predict_proba(X_all)[:, 1]
    acc = accuracy_score(y_all, preds)
    auc = roc_auc_score(y_all, probas)
    f1 = f1_score(y_all, preds)
    logloss_val = log_loss(y_all, probas)
    p_at_50 = precision_at_top_n(y_all, probas, top_n=50)
    print(f"📊 정확도: {acc:.2%} | AUC: {auc:.3f} | F1: {f1:.3f} | LogLoss: {logloss_val:.4f} | P@50: {p_at_50:.3f} | 샘플 수: {len(X_all):,}")
    print(f"🏆 최적 하이퍼파라미터: {grid_search.best_params_}")

    # Append one row to the training log; write the header only when the
    # file is being created.
    date_tag = datetime.now().strftime("%Y-%m-%d")
    log_path = "data_analysis_engine/train_log.csv"
    # The log directory may not exist relative to the current working
    # directory, so create it before appending.
    os.makedirs(os.path.dirname(log_path), exist_ok=True)
    log_exists = os.path.exists(log_path)
    log_df = pd.DataFrame([{
        "date": date_tag,
        "accuracy": round(acc, 4),
        "auc": round(auc, 4),
        "f1_score": round(f1, 4),
        "logloss": round(logloss_val, 4),
        "precision@50": round(p_at_50, 4),
        "samples": len(X_all),
        "best_params": str(grid_search.best_params_),
        "cds_dir": cds_dir
    }])
    log_df.to_csv(log_path, mode='a', index=False, header=not log_exists)

    # Save the model under a date-stamped name.
    model_dir = "data_analysis_engine/models"
    os.makedirs(model_dir, exist_ok=True)
    versioned_path = os.path.join(model_dir, f"model_{date_tag}.json")
    best_model.save_model(versioned_path)
    print(f"✅ 모델 저장 완료: {versioned_path}")

    # Honour the caller-supplied path as an additional copy, so existing
    # consumers of the date-stamped file keep working.
    if output_model_path:
        target_dir = os.path.dirname(output_model_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        best_model.save_model(output_model_path)
        print(f"✅ 모델 저장 완료: {output_model_path}")

    print(f"📝 학습 로그 저장 완료: {log_path}")
if __name__ == "__main__":
    # Ask for the CDS directory; an empty answer falls back to "data".
    answer = input("CDS 데이터 디렉토리 [기본: data]: ").strip()
    train_model(answer if answer else "data")