Update

2022-09-05 20:22:54 +02:00 · 2022-09-05 20:22:54 +02:00 · 01f2fc36b9
parent 0ca4a677c4
commit 01f2fc36b9
1 changed files with 8 additions and 1 deletions
--- a/main.py
+++ b/main.py
@ -6,7 +6,9 @@ import calendar
 #patch_sklearn()
 from sklearn.model_selection import GridSearchCV
 from sklearn.ensemble import RandomForestRegressor
+from sklearn.preprocessing import StandardScaler

+# Improving dataset and modelling: https://medium.com/@maryamuzakariya/project-predict-stock-prices-using-random-forest-regression-model-in-python-fbe4edf01664
 rf_reg_grid = {
    'bootstrap': [True],
    'max_depth': [5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
@ -114,6 +116,11 @@ def split_data(df, split_value):
    X_train, y_train = df_train.drop("price", axis=1), df_train["price"]
    X_test, y_test = df_val.drop("price", axis = 1), df_val["price"]

+    # Standardize features by removing the mean and scaling to unit variance.
+    scale = StandardScaler()
+    x_train = scale.fit_transform(x_train)
+    x_test = scale.transform(x_test) 
+
    return X_train, X_test, y_train, y_test

 def find_best_hyperparameters_and_train(X_train, y_train):
@ -150,7 +157,7 @@ def main():
    # Prepare data for modelling
    df = process_data(df=df)
    # Split data into train and test sets
-    X_train, X_test, y_train, y_test = split_data(df=df, split_value=0.99999) 
+    X_train, X_test, y_train, y_test = split_data(df=df, split_value=0.99) 

    # Model the data
    model = find_best_hyperparameters_and_train(