import calendar
import time
from datetime import datetime, timezone, timedelta

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
# Hyperparameter search space consumed by find_best_hyperparameters_and_train
# (passed to GridSearchCV over a RandomForestRegressor).
rf_reg_grid = {
    'bootstrap': [True],
    'max_depth': [5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    # 1.0 == consider all features at each split (what 'auto' meant for
    # regressors before it was removed in scikit-learn 1.3).
    'max_features': [1.0, 'sqrt', 'log2'],
    'min_samples_leaf': [1, 2, 4, 6],
    # An integer min_samples_split must be >= 2 — the original list started
    # at 1, which makes scikit-learn raise at fit time.
    'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10, 12],
    'n_estimators': [80, 90, 100, 120, 150, 200]
}
# Best combination found by a previous grid-search run; kept so a model can
# be refit without repeating the (expensive) search.
best_params = {
    'bootstrap': [True],
    'max_depth': [90],
    # 1.0 == all features; the original 'auto' was removed in scikit-learn 1.3.
    'max_features': [1.0],
    'min_samples_leaf': [4],
    'min_samples_split': [10],
    'n_estimators': [100]
}
def get_data_and_store_csv(symbol, interval, start, end=None, limit=1000):
    """Download klines from the Binance REST API and store them as a CSV.

    Parameters
    ----------
    symbol : str
        Trading pair, e.g. 'XMRUSDT'.
    interval : str
        Binance kline interval, e.g. '30m'.
    start : str
        ISO date 'YYYY-MM-DD', interpreted as UTC. Must not be None.
    end : str or None
        ISO date 'YYYY-MM-DD' (UTC); None means "now".
    limit : int
        Page size per API call (Binance caps this at 1000).

    Returns the data as a DataFrame with columns
    Opentime/Open/High/Low/Close/Volume and writes it to
    data/{symbol}--interval-{interval}--start-{start}--end-{end}.csv,
    where start/end in the file name are epoch milliseconds.

    Raises ValueError when start is None.
    """
    df = pd.DataFrame()

    if start is None:
        # Raising is clearer than the original exit(0), which signalled success.
        raise ValueError('start time must not be None')
    start = calendar.timegm(datetime.fromisoformat(start).timetuple()) * 1000

    if end is None:
        # "now" in UTC as epoch milliseconds.
        # BUG FIX: the original returned here, so end=None never fetched data.
        end = int(datetime.now(timezone.utc).timestamp()) * 1000
    else:
        end = calendar.timegm(datetime.fromisoformat(end).timetuple()) * 1000

    last_time = None

    # Page through the API until the newest fetched candle reaches `end`.
    while len(df) == 0 or (last_time is not None and last_time < end):
        url = ('https://api.binance.com/api/v3/klines?symbol=' + symbol +
               '&interval=' + interval + '&limit=' + str(limit))  # honour `limit` (original hardcoded 1000)
        if len(df) == 0:
            url += '&startTime=' + str(start)
        else:
            # Resume one ms past the last seen candle; the original resumed AT
            # last_time, duplicating one row per page.
            url += '&startTime=' + str(last_time + 1)
        url += '&endTime=' + str(end)

        df2 = pd.read_json(url)
        if df2.empty:
            # No more candles in the requested window; the original would
            # crash on the column assignment below.
            break
        df2.columns = ['Opentime', 'Open', 'High', 'Low', 'Close', 'Volume',
                       'Closetime', 'Quote asset volume', 'Number of trades',
                       'Taker by base', 'Taker buy quote', 'Ignore']

        # Keep only the OHLCV columns plus the open time.
        dftmp = df2.drop(['Quote asset volume', 'Closetime', 'Number of trades',
                          'Taker by base', 'Taker buy quote', 'Ignore'], axis=1)
        dftmp.Opentime = pd.to_datetime(dftmp.Opentime, unit='ms')
        dftmp = dftmp.reindex(columns=["Opentime", "Open", "High", "Low", "Close", "Volume"])
        dftmp.reset_index(drop=True, inplace=True)

        # Epoch ms of the newest candle in this page; drives pagination.
        last_time = (dftmp['Opentime'][len(dftmp) - 1] - datetime(1970, 1, 1)) // timedelta(milliseconds=1)

        df = pd.concat([df, dftmp], axis=0, ignore_index=True, keys=None)

    df.to_csv(f'data/{symbol}--interval-{interval}--start-{start}--end-{end}.csv', index=False)
    return df
def get_stored_data(symbol, interval, start, end):
    """Load a previously downloaded kline CSV into a DataFrame.

    `start` and `end` are ISO dates ('YYYY-MM-DD', UTC); they are converted
    to the epoch-millisecond values embedded in the stored file name.
    """
    # NOTE(review): this reads from the absolute path /crypto_prediction/data/
    # while get_data_and_store_csv writes to the relative path data/ — confirm
    # both actually point at the same directory.
    start_ms = calendar.timegm(datetime.fromisoformat(start).timetuple()) * 1000
    end_ms = calendar.timegm(datetime.fromisoformat(end).timetuple()) * 1000
    path = f'/crypto_prediction/data/{symbol}--interval-{interval}--start-{start_ms}--end-{end_ms}.csv'
    return pd.read_csv(path, parse_dates=['Opentime'])
def process_data(df):
    """Turn raw kline rows into model features.

    Expands Opentime into calendar features, copies Close into the target
    column 'price', then drops the raw time/OHLCV columns. Mutates `df`
    in place and returns it.
    """
    opentime = df.Opentime.dt
    df["sale_year"] = opentime.year
    df["sale_month"] = opentime.month
    df["sale_day"] = opentime.day
    df["sale_day_of_week"] = opentime.dayofweek
    df["sale_day_of_year"] = opentime.dayofyear

    # The closing price is what the model predicts.
    df["price"] = df.Close

    # Drop the raw inputs, leaving only the engineered features + target.
    df.drop(columns=["Opentime", "Open", "High", "Low", "Close", "Volume"],
            inplace=True)
    return df
def split_data(df, split_value):
    """Chronological train/validation split.

    The first `split_value` fraction of rows becomes the training set and
    the remainder the validation set (no shuffling — the rows are a time
    series). Returns X_train, X_test, y_train, y_test, where y is the
    'price' column and X is everything else.
    """
    cut = int(len(df) * split_value)
    train, val = df[:cut], df[cut:]

    y_train = train["price"]
    X_train = train.drop("price", axis=1)
    y_test = val["price"]
    X_test = val.drop("price", axis=1)

    return X_train, X_test, y_train, y_test
def find_best_hyperparameters_and_train(X_train, y_train):
    """Grid-search RandomForestRegressor hyperparameters and fit the model.

    Uses the module-level `rf_reg_grid` as the search space. Returns the
    fitted GridSearchCV object, refit on the best parameter combination
    (so it can be used directly for scoring/prediction).
    """
    # `rf_reg_grid` is only read here, so no `global` declaration is needed.
    searcher = GridSearchCV(
        estimator=RandomForestRegressor(),
        param_grid=rf_reg_grid,
        cv=5,            # 5-fold cross-validation
        refit=True,      # refit on the whole training set with the best params
        n_jobs=-1,       # use every available core
    )
    searcher.fit(X_train, y_train)

    print("Best hyperparameters: ", searcher.best_params_)
    return searcher
def main():
    """Load stored kline data, train a grid-searched model, and score it."""
    # NOTE: the original declared `global models`, but no `models` name exists
    # anywhere in the file — dead statement, removed.

    # Read data into a pandas DataFrame (uncomment to re-download from Binance).
    #df = get_data_and_store_csv('XMRUSDT', '30m', '2022-09-01', '2022-09-04')
    df = get_stored_data('XMRUSDT', '30m', '2020-01-01', '2022-09-04')

    # Prepare data for modelling.
    df = process_data(df=df)

    # Split data into train and test sets (chronological, 95% train).
    X_train, X_test, y_train, y_test = split_data(df=df, split_value=0.95)

    # Model the data.
    model = find_best_hyperparameters_and_train(
        X_train=X_train,
        y_train=y_train
    )

    # Score the model on the held-out tail of the series.
    print('Model scored: ', model.score(X_test, y_test))
if __name__ == "__main__":
    # Time the full run and report it as hours / minutes / seconds.
    start_time = time.time()
    main()
    end_time = time.time() - start_time
    minutes, seconds = divmod(end_time, 60)
    hours, minutes = divmod(minutes, 60)
    print(f'\n\nCompleted...\t\t\t\t\t{hours} h {minutes} m {int(seconds)}s')