# crypto_prediction/main.py
import pandas as pd
import time
from datetime import datetime, timezone, timedelta
import calendar
#from sklearnex import patch_sklearn # broken :(
#patch_sklearn()
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
# Improving dataset and modelling: https://medium.com/@maryamuzakariya/project-predict-stock-prices-using-random-forest-regression-model-in-python-fbe4edf01664
# Hyperparameter search space for the RandomForestRegressor grid search.
# FIX: min_samples_split must be an int >= 2 (sklearn rejects 1), and
# max_features='auto' was removed in scikit-learn 1.3 — for a regressor it
# meant "all features", i.e. the fraction 1.0.
rf_reg_grid = {
    'bootstrap': [True],
    'max_depth': [5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'max_features': [1.0, 'sqrt', 'log2'],
    'min_samples_leaf': [1, 2, 4, 6],
    'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10, 12],
    'n_estimators': [80, 90, 100, 120, 150, 200]
}
# Best combination found by a previous grid-search run, kept so a rerun can
# skip the full search.
best_params = {
    'bootstrap': [True], 'max_depth': [90], 'max_features': [1.0], 'min_samples_leaf': [4], 'min_samples_split': [10], 'n_estimators': [100]
}
# Append-only text report written by report(); container-absolute path.
report_file = '/results/report.txt'
def get_data_and_store_csv(symbol, interval, start, end=None, limit=1000):
    """
    Download OHLCV klines for `symbol` from the Binance REST API, cache them
    to a CSV under data/, and return them as a DataFrame.

    start and end must be isoformat YYYY-MM-DD (interpreted as UTC); when end
    is None, "now" in UTC is used. Binance returns at most 1000 records per
    call, so we page through the range using the last Opentime seen as the
    next startTime.

    Raises ValueError if start is None.
    """
    # Source: https://stackoverflow.com/questions/66295187/how-do-i-get-all-the-prices-history-with-binance-api-for-a-crypto-using-python
    if start is None:
        # FIX: previously printed a message and called exit(0); raise so
        # callers can handle the error instead of the process silently dying.
        raise ValueError('start time must not be None')
    start = calendar.timegm(datetime.fromisoformat(start).timetuple()) * 1000
    if end is None:
        # FIX: the original had a stray `return` here, so the default-end
        # path always returned None without fetching anything.
        utc_time = datetime.now(timezone.utc)
        end = int(utc_time.timestamp()) * 1000
    else:
        end = calendar.timegm(datetime.fromisoformat(end).timetuple()) * 1000
    df = pd.DataFrame()
    last_time = None
    while len(df) == 0 or (last_time is not None and last_time < end):
        # FIX: wire the `limit` parameter into the request (was hard-coded 1000).
        url = 'https://api.binance.com/api/v3/klines?symbol=' + \
            symbol + '&interval=' + interval + '&limit=' + str(limit)
        if len(df) == 0:
            url += '&startTime=' + str(start)
        else:
            # Resume one page after the last Opentime we already have.
            url += '&startTime=' + str(last_time)
        url += '&endTime=' + str(end)
        df2 = pd.read_json(url)
        df2.columns = ['Opentime', 'Open', 'High', 'Low', 'Close', 'Volume', 'Closetime',
                       'Quote asset volume', 'Number of trades', 'Taker by base', 'Taker buy quote', 'Ignore']
        dftmp = df2.copy()
        dftmp.Opentime = pd.to_datetime(dftmp.Opentime, unit='ms')
        dftmp = dftmp.drop(['Quote asset volume', 'Closetime',
                            'Number of trades', 'Taker by base', 'Taker buy quote', 'Ignore'], axis=1)
        column_names = ["Opentime", "Open", "High", "Low", "Close", "Volume"]
        dftmp.reset_index(drop=True, inplace=True)
        dftmp = dftmp.reindex(columns=column_names)
        # Convert the newest Opentime back to epoch milliseconds for paging.
        last_time = (dftmp['Opentime'][len(dftmp) - 1] - datetime(1970, 1, 1)) // timedelta(milliseconds=1)
        df = pd.concat([df, dftmp], axis=0, ignore_index=True, keys=None)
    # Cache file name embeds the epoch-ms bounds; get_stored_data mirrors this.
    df.to_csv(f'data/{symbol}--interval-{interval}--start-{start}--end-{end}.csv', index=False)
    return df
def get_stored_data(symbol, interval, start, end):
    """Load a previously cached klines CSV (see get_data_and_store_csv).

    start/end are ISO dates (YYYY-MM-DD); the cache file name embeds their
    UTC epoch-millisecond equivalents.
    """
    start_ms = calendar.timegm(datetime.fromisoformat(start).timetuple()) * 1000
    end_ms = calendar.timegm(datetime.fromisoformat(end).timetuple()) * 1000
    # NOTE(review): container-absolute path — the relative 'data/...' variant
    # used by get_data_and_store_csv only works from the project root.
    path = f'/crypto_prediction/data/{symbol}--interval-{interval}--start-{start_ms}--end-{end_ms}.csv'
    return pd.read_csv(path, parse_dates=['Opentime'])
def process_data(df):
    """Expand Opentime into calendar features and keep Close as `price`.

    Mutates `df` in place and returns it; every raw OHLCV column is dropped,
    leaving only time-derived features plus the prediction target.
    """
    ts = df.Opentime.dt
    df["sale_year"] = ts.year
    df["sale_month"] = ts.month
    df["sale_day"] = ts.day
    df["sale_hour"] = ts.hour
    df["minute"] = ts.minute
    df["sale_day_of_week"] = ts.dayofweek
    df["sale_day_of_year"] = ts.dayofyear
    df["quarter"] = ts.quarter
    df["price"] = df.Close
    # One combined drop instead of six separate in-place drops.
    df.drop(["Opentime", "Open", "High", "Low", "Close", "Volume"],
            axis=1, inplace=True)
    return df
def split_data(df, split_value):
    """Chronologically split `df` into train/test sets and standardize features.

    Parameters
    ----------
    df : DataFrame containing a `price` target column (see process_data).
    split_value : fraction (0-1) of rows used for training; the split is
        positional, not shuffled, preserving time order.

    Returns
    -------
    X_train, X_test, y_train, y_test — the feature matrices come back as
    numpy arrays scaled to zero mean / unit variance (scaler fit on the
    training portion only, to avoid leaking test statistics).
    """
    df_split = int(len(df) * split_value)
    df_train = df[:df_split]
    df_val = df[df_split:]
    X_train, y_train = df_train.drop("price", axis=1), df_train["price"]
    X_test, y_test = df_val.drop("price", axis=1), df_val["price"]
    # Standardize features by removing the mean and scaling to unit variance.
    # FIX: the original assigned to undefined lowercase `x_train`/`x_test`
    # (NameError at runtime) and the scaled arrays were never returned.
    scale = StandardScaler()
    X_train = scale.fit_transform(X_train)
    X_test = scale.transform(X_test)
    return X_train, X_test, y_train, y_test
def find_best_hyperparameters_and_train(X_train, y_train):
    """Grid-search a RandomForestRegressor over rf_reg_grid with 5-fold CV.

    Logs the winning hyperparameters to the module-level report file and
    returns the fitted GridSearchCV object (refit on the full training set).
    """
    # Note: `global` declarations are unnecessary for read-only access to
    # rf_reg_grid / report_file, so they are omitted.
    searcher = GridSearchCV(
        RandomForestRegressor(),
        param_grid=rf_reg_grid,
        cv=5,
        refit=True,
        n_jobs=-1,  # use every available core
    )
    searcher.fit(X_train, y_train)
    report(report_file, f'Best hyperparams:\n{searcher.best_params_}')
    return searcher
def report(f_name, report_data):
    """Append `report_data` plus a trailing newline to the text file `f_name`."""
    # Cleanup: the explicit f.flush()/f.close() were redundant — the `with`
    # statement already flushes and closes the file on exit.
    with open(f_name, 'a') as f:
        f.write(report_data)
        f.write('\n')
def main():
    """End-to-end pipeline: load cached data, featurize, train, report."""
    # Load cached klines; uncomment the fetch to refresh the cache first.
    #df = get_data_and_store_csv('XMRUSDT', '1m', '2020-01-01', '2022-09-04')
    df = get_stored_data('XMRUSDT', '1m', '2020-01-01', '2022-09-04')
    # Turn raw OHLCV rows into calendar features + price target.
    df = process_data(df=df)
    # Chronological 99/1 train/test split with feature scaling.
    X_train, X_test, y_train, y_test = split_data(df=df, split_value=0.99)
    # Grid-search and fit the random-forest model.
    model = find_best_hyperparameters_and_train(
        X_train=X_train,
        y_train=y_train
    )
    # Persist what we predicted versus what actually happened.
    report(report_file, f'Test data:\n{X_test}')
    report(report_file, f'Prediction results:\n{model.predict(X_test)}')
    report(report_file, f'Correct values:\n{y_test}')
    report(report_file, f'Model scored:\n{model.score(X_test, y_test)}')
if __name__ == "__main__":
    start_time = time.time()
    prettified_time = datetime.now().strftime("%H:%M:%S")
    report(report_file, f"\n Starting! {prettified_time}")
    main()
    end_time = time.time() - start_time
    # FIX: build the summary once (it was duplicated verbatim for print and
    # report) and cast hours/minutes to int — `end_time // 3600` on a float
    # previously printed as e.g. "0.0 h".
    summary = (f'\n\nCompleted...\t\t\t\t\t{int(end_time // 3600)} h '
               f'{int((end_time // 60) % 60)} m {int(end_time % 60)}s')
    print(summary)
    report(report_file, summary)