# crypto_prediction/main.py
import pandas as pd
import time
from datetime import datetime, timezone, timedelta
import calendar
#from sklearnex import patch_sklearn # broken :(
#patch_sklearn()
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
# Improving dataset and modelling: https://medium.com/@maryamuzakariya/project-predict-stock-prices-using-random-forest-regression-model-in-python-fbe4edf01664
# Hyperparameter search space for the RandomForestRegressor grid search.
# FIX: min_samples_split must be an int >= 2 (sklearn rejects 1), and
# max_features='auto' was removed in scikit-learn 1.3 — for a regressor it
# meant "all features", i.e. the fraction 1.0.
rf_reg_grid = {
    'bootstrap': [True],
    'max_depth': [5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'max_features': [1.0, 'sqrt', 'log2'],
    'min_samples_leaf': [1, 2, 4, 6],
    'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10, 12],
    'n_estimators': [80, 90, 100, 120, 150, 200]
}
# Best combination found by a previous grid-search run, kept so a rerun can
# skip the full search.
best_params = {
    'bootstrap': [True], 'max_depth': [90], 'max_features': [1.0], 'min_samples_leaf': [4], 'min_samples_split': [10], 'n_estimators': [100]
}
# Append-only text report written by report(); container-absolute path.
report_file = '/results/report.txt'
def get_data_and_store_csv(symbol, interval, start, end=None, limit=1000):
    """
    Download OHLCV klines for `symbol` from the Binance REST API, cache them
    to a CSV under data/, and return them as a DataFrame.

    start and end must be isoformat YYYY-MM-DD (interpreted as UTC); when end
    is None, "now" in UTC is used. Binance returns at most 1000 records per
    call, so we page through the range using the last Opentime seen as the
    next startTime.

    Raises ValueError if start is None.
    """
    # Source: https://stackoverflow.com/questions/66295187/how-do-i-get-all-the-prices-history-with-binance-api-for-a-crypto-using-python
    if start is None:
        # FIX: previously printed a message and called exit(0); raise so
        # callers can handle the error instead of the process silently dying.
        raise ValueError('start time must not be None')
    start = calendar.timegm(datetime.fromisoformat(start).timetuple()) * 1000
    if end is None:
        # FIX: the original had a stray `return` here, so the default-end
        # path always returned None without fetching anything.
        utc_time = datetime.now(timezone.utc)
        end = int(utc_time.timestamp()) * 1000
    else:
        end = calendar.timegm(datetime.fromisoformat(end).timetuple()) * 1000
    df = pd.DataFrame()
    last_time = None
    while len(df) == 0 or (last_time is not None and last_time < end):
        # FIX: wire the `limit` parameter into the request (was hard-coded 1000).
        url = 'https://api.binance.com/api/v3/klines?symbol=' + \
            symbol + '&interval=' + interval + '&limit=' + str(limit)
        if len(df) == 0:
            url += '&startTime=' + str(start)
        else:
            # Resume one page after the last Opentime we already have.
            url += '&startTime=' + str(last_time)
        url += '&endTime=' + str(end)
        df2 = pd.read_json(url)
        df2.columns = ['Opentime', 'Open', 'High', 'Low', 'Close', 'Volume', 'Closetime',
                       'Quote asset volume', 'Number of trades', 'Taker by base', 'Taker buy quote', 'Ignore']
        dftmp = df2.copy()
        dftmp.Opentime = pd.to_datetime(dftmp.Opentime, unit='ms')
        dftmp = dftmp.drop(['Quote asset volume', 'Closetime',
                            'Number of trades', 'Taker by base', 'Taker buy quote', 'Ignore'], axis=1)
        column_names = ["Opentime", "Open", "High", "Low", "Close", "Volume"]
        dftmp.reset_index(drop=True, inplace=True)
        dftmp = dftmp.reindex(columns=column_names)
        # Convert the newest Opentime back to epoch milliseconds for paging.
        last_time = (dftmp['Opentime'][len(dftmp) - 1] - datetime(1970, 1, 1)) // timedelta(milliseconds=1)
        df = pd.concat([df, dftmp], axis=0, ignore_index=True, keys=None)
    # Cache file name embeds the epoch-ms bounds; get_stored_data mirrors this.
    df.to_csv(f'data/{symbol}--interval-{interval}--start-{start}--end-{end}.csv', index=False)
    return df
def get_stored_data(symbol, interval, start, end):
    """Load a previously cached klines CSV (see get_data_and_store_csv).

    start/end are ISO dates (YYYY-MM-DD); the cache file name embeds their
    UTC epoch-millisecond equivalents.
    """
    start_ms = calendar.timegm(datetime.fromisoformat(start).timetuple()) * 1000
    end_ms = calendar.timegm(datetime.fromisoformat(end).timetuple()) * 1000
    # NOTE(review): container-absolute path — the relative 'data/...' variant
    # used by get_data_and_store_csv only works from the project root.
    path = f'/crypto_prediction/data/{symbol}--interval-{interval}--start-{start_ms}--end-{end_ms}.csv'
    return pd.read_csv(path, parse_dates=['Opentime'])
def process_data(df):
    """Expand Opentime into calendar features and keep Close as `price`.

    Mutates `df` in place and returns it; every raw OHLCV column is dropped,
    leaving only time-derived features plus the prediction target.
    """
    ts = df.Opentime.dt
    df["sale_year"] = ts.year
    df["sale_month"] = ts.month
    df["sale_day"] = ts.day
    df["sale_hour"] = ts.hour
    df["minute"] = ts.minute
    df["sale_day_of_week"] = ts.dayofweek
    df["sale_day_of_year"] = ts.dayofyear
    df["quarter"] = ts.quarter
    df["price"] = df.Close
    # One combined drop instead of six separate in-place drops.
    df.drop(["Opentime", "Open", "High", "Low", "Close", "Volume"],
            axis=1, inplace=True)
    return df
def split_data(df, split_value):
    """Chronologically split `df` into train/test sets and standardize features.

    Parameters
    ----------
    df : DataFrame containing a `price` target column (see process_data).
    split_value : fraction (0-1) of rows used for training; the split is
        positional, not shuffled, preserving time order.

    Returns
    -------
    X_train, X_test, y_train, y_test — the feature matrices come back as
    numpy arrays scaled to zero mean / unit variance (scaler fit on the
    training portion only, to avoid leaking test statistics).
    """
    df_split = int(len(df) * split_value)
    df_train = df[:df_split]
    df_val = df[df_split:]
    X_train, y_train = df_train.drop("price", axis=1), df_train["price"]
    X_test, y_test = df_val.drop("price", axis=1), df_val["price"]
    # Standardize features by removing the mean and scaling to unit variance.
    # FIX: the original assigned to undefined lowercase `x_train`/`x_test`
    # (NameError at runtime) and the scaled arrays were never returned.
    scale = StandardScaler()
    X_train = scale.fit_transform(X_train)
    X_test = scale.transform(X_test)
    return X_train, X_test, y_train, y_test
def find_best_hyperparameters_and_train(X_train, y_train):
    """Grid-search a RandomForestRegressor over rf_reg_grid with 5-fold CV.

    Logs the winning hyperparameters to the module-level report file and
    returns the fitted GridSearchCV object (refit on the full training set).
    """
    # Note: `global` declarations are unnecessary for read-only access to
    # rf_reg_grid / report_file, so they are omitted.
    searcher = GridSearchCV(
        RandomForestRegressor(),
        param_grid=rf_reg_grid,
        cv=5,
        refit=True,
        n_jobs=-1,  # use every available core
    )
    searcher.fit(X_train, y_train)
    report(report_file, f'Best hyperparams:\n{searcher.best_params_}')
    return searcher
def report(f_name, report_data):
    """Append `report_data` plus a trailing newline to the text file `f_name`."""
    # Cleanup: the explicit f.flush()/f.close() were redundant — the `with`
    # statement already flushes and closes the file on exit.
    with open(f_name, 'a') as f:
        f.write(report_data)
        f.write('\n')
def main():
    """End-to-end pipeline: load cached data, featurize, train, report."""
    # Load cached klines; uncomment the fetch to refresh the cache first.
    #df = get_data_and_store_csv('XMRUSDT', '1m', '2020-01-01', '2022-09-04')
    df = get_stored_data('XMRUSDT', '1m', '2020-01-01', '2022-09-04')
    # Turn raw OHLCV rows into calendar features + price target.
    df = process_data(df=df)
    # Chronological 99/1 train/test split with feature scaling.
    X_train, X_test, y_train, y_test = split_data(df=df, split_value=0.99)
    # Grid-search and fit the random-forest model.
    model = find_best_hyperparameters_and_train(
        X_train=X_train,
        y_train=y_train
    )
    # Persist what we predicted versus what actually happened.
    report(report_file, f'Test data:\n{X_test}')
    report(report_file, f'Prediction results:\n{model.predict(X_test)}')
    report(report_file, f'Correct values:\n{y_test}')
    report(report_file, f'Model scored:\n{model.score(X_test, y_test)}')
if __name__ == "__main__":
    start_time = time.time()
    prettified_time = datetime.now().strftime("%H:%M:%S")
    report(report_file, f"\n Starting! {prettified_time}")
    main()
    end_time = time.time() - start_time
    # FIX: build the summary once (it was duplicated verbatim for print and
    # report) and cast hours/minutes to int — `end_time // 3600` on a float
    # previously printed as e.g. "0.0 h".
    summary = (f'\n\nCompleted...\t\t\t\t\t{int(end_time // 3600)} h '
               f'{int((end_time // 60) % 60)} m {int(end_time % 60)}s')
    print(summary)
    report(report_file, summary)