import pandas as pd
import time
from datetime import datetime, timezone, timedelta
import calendar
#from sklearnex import patch_sklearn  # broken :(
#patch_sklearn()
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# Improving dataset and modelling: https://medium.com/@maryamuzakariya/project-predict-stock-prices-using-random-forest-regression-model-in-python-fbe4edf01664
# Note: an integer min_samples_split must be >= 2 (scikit-learn rejects 1),
# and max_features='auto' was removed in scikit-learn 1.3; for regressors its
# equivalent is 1.0 (all features).
rf_reg_grid = {
    'bootstrap': [True],
    'max_depth': [5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'max_features': [1.0, 'sqrt', 'log2'],
    'min_samples_leaf': [1, 2, 4, 6],
    'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10, 12],
    'n_estimators': [80, 90, 100, 120, 150, 200]
}
# Result of an earlier grid search; can be passed as param_grid to skip the full sweep.
best_params = {
    'bootstrap': [True],
    'max_depth': [90],
    'max_features': [1.0],
    'min_samples_leaf': [4],
    'min_samples_split': [10],
    'n_estimators': [100]
}
report_file = '/results/report.txt'


def get_data_and_store_csv(symbol, interval, start, end=None, limit=1000):
    """
    start and end must be in isoformat (YYYY-MM-DD) and are interpreted
    as UTC. Binance returns at most 1000 records per API call, so the
    requested range is fetched page by page.
    """
    # Source: https://stackoverflow.com/questions/66295187/how-do-i-get-all-the-prices-history-with-binance-api-for-a-crypto-using-python
    df = pd.DataFrame()
    if start is None:
        raise ValueError('start time must not be None')
    start = calendar.timegm(datetime.fromisoformat(start).timetuple()) * 1000
    if end is None:
        # Default to "now" in UTC. (A stray `return` here used to abort the
        # whole download whenever no end date was given.)
        end = int(datetime.now(timezone.utc).timestamp()) * 1000
    else:
        end = calendar.timegm(datetime.fromisoformat(end).timetuple()) * 1000

    last_time = None
    while len(df) == 0 or (last_time is not None and last_time < end):
        url = 'https://api.binance.com/api/v3/klines?symbol=' + \
            symbol + '&interval=' + interval + '&limit=' + str(limit)
        if len(df) == 0:
            url += '&startTime=' + str(start)
        else:
            # Resume one millisecond past the last open time so the final
            # candle of the previous page is not fetched twice.
            url += '&startTime=' + str(last_time + 1)
        url += '&endTime=' + str(end)
        df2 = pd.read_json(url)
        if df2.empty:
            # No more candles in the requested window.
            break
        df2.columns = ['Opentime', 'Open', 'High', 'Low', 'Close', 'Volume',
                       'Closetime', 'Quote asset volume', 'Number of trades',
                       'Taker buy base', 'Taker buy quote', 'Ignore']
        dftmp = df2.drop(['Quote asset volume', 'Closetime', 'Number of trades',
                          'Taker buy base', 'Taker buy quote', 'Ignore'], axis=1)
        dftmp.Opentime = pd.to_datetime(dftmp.Opentime, unit='ms')
        dftmp = dftmp.reindex(columns=["Opentime", "Open", "High", "Low", "Close", "Volume"])
        dftmp.reset_index(drop=True, inplace=True)
        # Open time of the newest candle, in ms since the epoch.
        last_time = (dftmp['Opentime'][len(dftmp) - 1] - datetime(1970, 1, 1)) // timedelta(milliseconds=1)
        df = pd.concat([df, dftmp], axis=0, ignore_index=True)

    df.to_csv(f'data/{symbol}--interval-{interval}--start-{start}--end-{end}.csv', index=False)
    return df


def get_stored_data(symbol, interval, start, end):
    start = calendar.timegm(datetime.fromisoformat(start).timetuple()) * 1000
    end = calendar.timegm(datetime.fromisoformat(end).timetuple()) * 1000
    df = pd.read_csv(
        f'/crypto_prediction/data/{symbol}--interval-{interval}--start-{start}--end-{end}.csv',
        #f'data/{symbol}--interval-{interval}--start-{start}--end-{end}.csv',
        parse_dates=['Opentime']
    )
    return df
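
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, never called by the pipeline): how the
# fetcher above might be exercised on a small range. _example_fetch is a
# hypothetical helper; it assumes network access to api.binance.com, an
# existing writable ./data directory, and placeholder symbol/dates.
def _example_fetch():
    # One day of hourly candles fits in a single API page and is also
    # written to data/ as a CSV by get_data_and_store_csv itself.
    sample = get_data_and_store_csv('XMRUSDT', '1h', '2022-09-01', '2022-09-02')
    print(sample.head())  # columns: Opentime, Open, High, Low, Close, Volume
    print(len(sample))    # roughly 24 hourly candles for a one-day range
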
def process_data(df):
    # Expand the open time into calendar features and keep Close as the target.
    df["sale_year"] = df.Opentime.dt.year
    df["sale_month"] = df.Opentime.dt.month
    df["sale_day"] = df.Opentime.dt.day
    df["sale_hour"] = df.Opentime.dt.hour
    df["minute"] = df.Opentime.dt.minute
    df["sale_day_of_week"] = df.Opentime.dt.dayofweek
    df["sale_day_of_year"] = df.Opentime.dt.dayofyear
    df["quarter"] = df.Opentime.dt.quarter
    df["price"] = df.Close
    df.drop(["Opentime", "Open", "High", "Low", "Close", "Volume"], axis=1, inplace=True)
    return df


def split_data(df, split_value):
    # Time-ordered split: the first split_value fraction of the candles
    # trains, the remainder tests, so no future data leaks into training.
    df_split = int(len(df) * split_value)
    df_train = df[:df_split]
    df_val = df[df_split:]
    X_train, y_train = df_train.drop("price", axis=1), df_train["price"]
    X_test, y_test = df_val.drop("price", axis=1), df_val["price"]
    # Standardize features by removing the mean and scaling to unit variance;
    # the scaler is fitted on the training set only.
    scale = StandardScaler()
    X_train = scale.fit_transform(X_train)
    X_test = scale.transform(X_test)
    return X_train, X_test, y_train, y_test


def find_best_hyperparameters_and_train(X_train, y_train):
    gs_rf_reg = GridSearchCV(
        RandomForestRegressor(),
        param_grid=rf_reg_grid,
        cv=5,
        refit=True,
        n_jobs=-1
    )
    gs_rf_reg.fit(X_train, y_train)
    report(report_file, f'Best hyperparams:\n{gs_rf_reg.best_params_}')
    return gs_rf_reg


def report(f_name, report_data):
    # The with-block closes (and flushes) the file on exit.
    with open(f_name, 'a') as f:
        f.write(report_data)
        f.write('\n')


def main():
    # Read data into a pandas dataframe
    #df = get_data_and_store_csv('XMRUSDT', '1m', '2020-01-01', '2022-09-04')
    df = get_stored_data('XMRUSDT', '1m', '2020-01-01', '2022-09-04')

    # Prepare data for modelling
    df = process_data(df=df)

    # Split data into train and test sets
    X_train, X_test, y_train, y_test = split_data(df=df, split_value=0.99)

    # Model the data
    model = find_best_hyperparameters_and_train(X_train=X_train, y_train=y_train)

    # Write the report
    report(report_file, f'Test data:\n{X_test}')
    report(report_file, f'Prediction results:\n{model.predict(X_test)}')
    report(report_file, f'Correct values:\n{y_test}')
    report(report_file, f'Model scored:\n{model.score(X_test, y_test)}')


if __name__ == "__main__":
    start_time = time.time()
    prettified_time = datetime.now().strftime("%H:%M:%S")
    report(report_file, f"\nStarting! {prettified_time}")
    main()
    elapsed = time.time() - start_time
    duration = f'{int(elapsed // 3600)} h {int((elapsed // 60) % 60)} m {int(elapsed % 60)} s'
    print(f'\n\nCompleted...\t\t\t\t\t{duration}')
    report(report_file, f'\n\nCompleted...\t\t\t\t\t{duration}')
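
# ---------------------------------------------------------------------------
# Appendix sketch (an assumption, not something the pipeline above runs):
# GridSearchCV's plain 5-fold CV ignores the temporal order of the candles,
# so some folds validate on data that predates their training data. A
# time-aware alternative using scikit-learn's TimeSeriesSplit is sketched
# below; the function name is hypothetical and main() never calls it.
from sklearn.model_selection import TimeSeriesSplit


def find_best_hyperparameters_time_aware(X_train, y_train):
    # Expanding-window CV: each fold trains on an initial segment of the
    # series and validates on the block that immediately follows it.
    gs = GridSearchCV(
        RandomForestRegressor(),
        param_grid=rf_reg_grid,
        cv=TimeSeriesSplit(n_splits=5),
        refit=True,
        n_jobs=-1
    )
    gs.fit(X_train, y_train)
    report(report_file, f'Best hyperparams (time-aware CV):\n{gs.best_params_}')
    return gs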