From 01f2fc36b9bbbf55296858e53ace1456d6e7d2af Mon Sep 17 00:00:00 2001
From: Gasper Spagnolo
Date: Mon, 5 Sep 2022 20:22:54 +0200
Subject: [PATCH] Update

---
 main.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/main.py b/main.py
index b008d77..260f293 100644
--- a/main.py
+++ b/main.py
@@ -6,7 +6,9 @@ import calendar
 #patch_sklearn()
 from sklearn.model_selection import GridSearchCV
 from sklearn.ensemble import RandomForestRegressor
+from sklearn.preprocessing import StandardScaler
 
+# Improving dataset and modelling: https://medium.com/@maryamuzakariya/project-predict-stock-prices-using-random-forest-regression-model-in-python-fbe4edf01664
 rf_reg_grid = {
     'bootstrap': [True],
     'max_depth': [5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
@@ -114,6 +116,11 @@ def split_data(df, split_value):
     X_train, y_train = df_train.drop("price", axis=1), df_train["price"]
     X_test, y_test = df_val.drop("price", axis = 1), df_val["price"]
 
+    # Standardize features by removing the mean and scaling to unit variance.
+    scale = StandardScaler()
+    X_train = scale.fit_transform(X_train)
+    X_test = scale.transform(X_test)
+
     return X_train, X_test, y_train, y_test
 
 def find_best_hyperparameters_and_train(X_train, y_train):
@@ -150,7 +157,7 @@ def main():
     # Prepare data for moddeling
     df = process_data(df=df)
     # Split data into train and test sets
-    X_train, X_test, y_train, y_test = split_data(df=df, split_value=0.99999)
+    X_train, X_test, y_train, y_test = split_data(df=df, split_value=0.99)
     # Model the data
     model = find_best_hyperparameters_and_train(