"""Fetch Binance kline data, train a RandomForestRegressor on it, and score the model.

Data is downloaded (or read back) as CSV files under data/, time-indexed
features are derived from the kline open time, and a grid search (currently a
one-point grid of previously-found best hyperparameters) fits the model.
"""

import calendar
import time
from datetime import datetime, timedelta, timezone

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Full hyperparameter search space for tuning runs.
# NOTE: sklearn requires min_samples_split >= 2 (the original grid contained 1,
# which GridSearchCV rejects), and the string 'auto' for max_features was
# removed for regressors in sklearn >= 1.3 — 1.0 ("use all features") is the
# equivalent setting.
rf_reg_grid = {
    'bootstrap': [True],
    'max_depth': [5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'max_features': [1.0, 'sqrt', 'log2'],
    'min_samples_leaf': [1, 2, 4, 6],
    'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10, 12],
    'n_estimators': [80, 90, 100, 120, 150, 200],
}

# Previously-found best hyperparameters. Each value is wrapped in a list so the
# dict can be passed straight to GridSearchCV as a one-point param_grid.
best_params = {
    'bootstrap': [True],
    'max_depth': [90],
    'max_features': [1.0],
    'min_samples_leaf': [4],
    'min_samples_split': [10],
    'n_estimators': [100],
}


def _isoformat_to_ms(stamp):
    """Convert a YYYY-MM-DD isoformat string to UTC epoch milliseconds."""
    return calendar.timegm(datetime.fromisoformat(stamp).timetuple()) * 1000


def get_data_and_store_csv(symbol, interval, start, end=None, limit=1000):
    """
    Download klines from Binance, store them as a CSV under data/, return a DataFrame.

    start and end must be isoformat YYYY-MM-DD. We are using the UTC time zone.
    The maximum is 1000 records per Binance API call, so the endpoint is paged
    until the requested window [start, end] is covered. `end=None` means "now".
    (`limit` is kept for interface compatibility; the request is pinned at 1000.)

    Raises ValueError if start is None.
    """
    # Source: https://stackoverflow.com/questions/66295187/how-do-i-get-all-the-prices-history-with-binance-api-for-a-crypto-using-python
    if start is None:
        # Raise instead of exit(0): exit(0) reported *success* and killed the
        # whole process on an error path.
        raise ValueError('start time must not be None')
    start = _isoformat_to_ms(start)
    if end is None:
        # Default the window end to "now" in UTC.
        # (The original code executed a bare `return` here by mistake,
        # aborting the download and returning None.)
        end = int(datetime.now(timezone.utc).timestamp()) * 1000
    else:
        end = _isoformat_to_ms(end)

    df = pd.DataFrame()
    last_time = None
    while len(df) == 0 or (last_time is not None and last_time < end):
        url = ('https://api.binance.com/api/v3/klines?symbol=' + symbol +
               '&interval=' + interval + '&limit=1000')
        # First page starts at `start`; later pages resume from the open time
        # of the newest row already fetched.
        url += '&startTime=' + str(start if len(df) == 0 else last_time)
        url += '&endTime=' + str(end)
        page = pd.read_json(url)
        page.columns = ['Opentime', 'Open', 'High', 'Low', 'Close', 'Volume',
                        'Closetime', 'Quote asset volume', 'Number of trades',
                        'Taker by base', 'Taker buy quote', 'Ignore']
        page.Opentime = pd.to_datetime(page.Opentime, unit='ms')
        page = page.drop(['Quote asset volume', 'Closetime', 'Number of trades',
                          'Taker by base', 'Taker buy quote', 'Ignore'], axis=1)
        page.reset_index(drop=True, inplace=True)
        page = page.reindex(columns=['Opentime', 'Open', 'High', 'Low', 'Close', 'Volume'])
        # Epoch ms of the newest fetched row — drives the pagination loop.
        last_time = (page['Opentime'][len(page) - 1] - datetime(1970, 1, 1)) // timedelta(milliseconds=1)
        df = pd.concat([df, page], axis=0, ignore_index=True)

    df.to_csv(f'data/{symbol}--interval-{interval}--start-{start}--end-{end}.csv', index=False)
    return df


def get_stored_data(symbol, interval, start, end):
    """Read back a CSV previously written by get_data_and_store_csv.

    start/end are isoformat YYYY-MM-DD strings; they are converted to the same
    epoch-ms values used in the stored file name.
    """
    start = _isoformat_to_ms(start)
    end = _isoformat_to_ms(end)
    return pd.read_csv(
        f'data/{symbol}--interval-{interval}--start-{start}--end-{end}.csv',
        parse_dates=['Opentime'],
    )


def process_data(df):
    """Derive calendar features from Opentime, keep Close as `price`, drop raw columns.

    Mutates and returns df. The resulting frame holds only the engineered
    time features plus the `price` target column.
    """
    df["sale_year"] = df.Opentime.dt.year
    df["sale_month"] = df.Opentime.dt.month
    df["sale_day"] = df.Opentime.dt.day
    df["sale_hour"] = df.Opentime.dt.hour
    df["minute"] = df.Opentime.dt.minute
    df["sale_day_of_week"] = df.Opentime.dt.dayofweek
    df["sale_day_of_year"] = df.Opentime.dt.dayofyear
    df["quarter"] = df.Opentime.dt.quarter
    df["price"] = df.Close
    # Drop the raw kline columns in one pass.
    df.drop(["Opentime", "Open", "High", "Low", "Close", "Volume"],
            axis=1, inplace=True)
    return df


def split_data(df, split_value):
    """Chronologically split df into train/test features and `price` targets.

    split_value is the fraction (0..1) of rows assigned to the training set;
    no shuffling is done, preserving time order.

    Returns (X_train, X_test, y_train, y_test).
    """
    cut = int(len(df) * split_value)
    df_train, df_val = df[:cut], df[cut:]
    X_train, y_train = df_train.drop("price", axis=1), df_train["price"]
    X_test, y_test = df_val.drop("price", axis=1), df_val["price"]
    return X_train, X_test, y_train, y_test


def find_best_hyperparameters_and_train(X_train, y_train):
    """Grid-search a RandomForestRegressor over best_params and fit it.

    Currently searches the one-point `best_params` grid (i.e. just fits the
    known-best configuration under 5-fold CV). Swap in `rf_reg_grid` for a
    full search. Returns the fitted GridSearchCV object.
    """
    # NOTE: the original declared `global rf_reg_grid` but never used it here;
    # the search has always run over `best_params`.
    gs_rf_reg = GridSearchCV(
        RandomForestRegressor(),
        param_grid=best_params,
        cv=5,
        refit=True,
        n_jobs=-1,
    )
    gs_rf_reg.fit(X_train, y_train)
    print("Best hyperparameters: ", gs_rf_reg.best_params_)
    return gs_rf_reg


def main():
    """Load stored XMRUSDT data, engineer features, train and score the model."""
    # Read data into a pandas DataFrame.
    # df = get_data_and_store_csv('XMRUSDT', '1m', '2020-01-01', '2022-09-04')
    df = get_stored_data('XMRUSDT', '1m', '2020-01-01', '2022-09-04')

    # Prepare data for modelling.
    df = process_data(df=df)

    # Split data into train and test sets (time-ordered, ~all rows train).
    X_train, X_test, y_train, y_test = split_data(df=df, split_value=0.9999)

    # Model the data.
    model = find_best_hyperparameters_and_train(X_train=X_train, y_train=y_train)

    # Score the model on the held-out tail.
    print("Test data:\n", X_test)
    print("Prediction results:\n", model.predict(X_test))
    print("Correct values:\n", y_test)
    print('Model scored:\n', model.score(X_test, y_test))


if __name__ == "__main__":
    start_time = time.time()
    main()
    elapsed = time.time() - start_time
    # Integer h/m/s breakdown (the original printed floats like "0.0 h").
    hours, rem = divmod(int(elapsed), 3600)
    minutes, seconds = divmod(rem, 60)
    print(f'\n\nCompleted...\t\t\t\t\t{hours} h {minutes} m {seconds}s')