#!/usr/bin/env python3
"""Fit a statsmodels OLS regression on synthetic data and save the results.

The script draws a fixed-seed synthetic dataset from a known linear model,
fits ordinary least squares with an intercept, and writes two files into
``--output_dir``:

* ``results.json``      -- fit statistics, coefficients, standard errors
  and p-values.
* ``model_summary.txt`` -- the textual statsmodels summary table.
"""
import argparse
import json
import logging
from pathlib import Path
import time
from typing import Any, Dict, Optional, Sequence, Tuple

import numpy as np
import pandas as pd
import statsmodels.api as sm

logger = logging.getLogger(__name__)

# Parameters of the synthetic data-generating process.
RANDOM_SEED = 42
N_SAMPLES = 1000
TRUE_COEF = (1.5, -2.0, 0.5, 3.0, -1.0)
NOISE_SCALE = 0.1


def _parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parse the command line; ``argv=None`` falls back to ``sys.argv[1:]``."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_dir", type=str, required=True)
    return parser.parse_args(argv)


def _generate_data() -> Tuple[pd.DataFrame, pd.Series]:
    """Return the synthetic ``(features, target)`` pair.

    The target is ``X @ TRUE_COEF`` plus Gaussian noise scaled by
    ``NOISE_SCALE``.
    """
    # The legacy global seed is kept (rather than a local Generator) so the
    # drawn values -- and therefore the saved results -- stay identical to
    # earlier runs.  X must be drawn before the noise for the same reason.
    np.random.seed(RANDOM_SEED)
    true_coef = np.array(TRUE_COEF)
    n_features = true_coef.size

    X = np.random.randn(N_SAMPLES, n_features)
    noise = np.random.randn(N_SAMPLES) * NOISE_SCALE
    y = X @ true_coef + noise

    feature_names = [f"feature_{i}" for i in range(n_features)]
    X_df = pd.DataFrame(X, columns=feature_names)
    y_series = pd.Series(y, name="target")
    return X_df, y_series


def _collect_results(model: Any, n_samples: int, n_features: int) -> Dict[str, Any]:
    """Build the JSON-serialisable summary dict from a fitted OLS result."""
    return {
        "model_type": "LinearRegression",
        "n_samples": n_samples,
        "n_features": n_features,
        "r_squared": float(model.rsquared),
        "adj_r_squared": float(model.rsquared_adj),
        "f_statistic": float(model.fvalue),
        "f_pvalue": float(model.f_pvalue),
        "coefficients": model.params.to_dict(),
        "standard_errors": model.bse.to_dict(),
        "p_values": model.pvalues.to_dict(),
    }


def _write_outputs(output_dir: Path, results: Dict[str, Any], summary_text: str) -> None:
    """Create ``output_dir`` if needed and write the JSON and summary files."""
    output_dir.mkdir(parents=True, exist_ok=True)

    # Explicit encoding so the files do not depend on the platform default.
    with open(output_dir / "results.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)

    with open(output_dir / "model_summary.txt", "w", encoding="utf-8") as f:
        f.write(summary_text)


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Run the full pipeline: generate data, fit OLS, save the outputs.

    Args:
        argv: Optional argument list (without the program name).  When
            ``None`` the arguments are read from ``sys.argv``, which is the
            behaviour of the plain ``main()`` call used by the script entry
            point.
    """
    args = _parse_args(argv)

    logging.basicConfig(level=logging.INFO)
    logger.info("Training statsmodels linear regression...")

    X_df, y_series = _generate_data()

    # Add constant for intercept
    X_with_const = sm.add_constant(X_df)

    model = sm.OLS(y_series, X_with_const).fit()
    logger.info("Model fitted successfully. R-squared: %.4f", model.rsquared)

    results = _collect_results(
        model,
        n_samples=len(X_df),
        n_features=X_df.shape[1],
    )
    _write_outputs(Path(args.output_dir), results, str(model.summary()))

    logger.info("Results and model summary saved successfully!")


if __name__ == "__main__":
    main()