import calendar
import time
from datetime import datetime, timezone, timedelta

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
# Hyperparameter search space consumed by find_best_hyperparameters_and_train
# (passed to GridSearchCV over a RandomForestRegressor).
rf_reg_grid = {
    'bootstrap': [True],
    'max_depth': [5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    # 1.0 == consider all features at each split (what 'auto' meant for
    # regressors before it was removed in scikit-learn 1.3).
    'max_features': [1.0, 'sqrt', 'log2'],
    'min_samples_leaf': [1, 2, 4, 6],
    # An integer min_samples_split must be >= 2 — the original list started
    # at 1, which makes scikit-learn raise at fit time.
    'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10, 12],
    'n_estimators': [80, 90, 100, 120, 150, 200]
}
# Best combination found by a previous grid-search run; kept so a model can
# be refit without repeating the (expensive) search.
best_params = {
    'bootstrap': [True],
    'max_depth': [90],
    # 1.0 == all features; the original 'auto' was removed in scikit-learn 1.3.
    'max_features': [1.0],
    'min_samples_leaf': [4],
    'min_samples_split': [10],
    'n_estimators': [100]
}
def get_data_and_store_csv(symbol, interval, start, end=None, limit=1000):
    """Download klines from the Binance REST API and store them as a CSV.

    Parameters
    ----------
    symbol : str
        Trading pair, e.g. 'XMRUSDT'.
    interval : str
        Binance kline interval, e.g. '30m'.
    start : str
        ISO date 'YYYY-MM-DD', interpreted as UTC. Must not be None.
    end : str or None
        ISO date 'YYYY-MM-DD' (UTC); None means "now".
    limit : int
        Page size per API call (Binance caps this at 1000).

    Returns the data as a DataFrame with columns
    Opentime/Open/High/Low/Close/Volume and writes it to
    data/{symbol}--interval-{interval}--start-{start}--end-{end}.csv,
    where start/end in the file name are epoch milliseconds.

    Raises ValueError when start is None.
    """
    df = pd.DataFrame()

    if start is None:
        # Raising is clearer than the original exit(0), which signalled success.
        raise ValueError('start time must not be None')
    start = calendar.timegm(datetime.fromisoformat(start).timetuple()) * 1000

    if end is None:
        # "now" in UTC as epoch milliseconds.
        # BUG FIX: the original returned here, so end=None never fetched data.
        end = int(datetime.now(timezone.utc).timestamp()) * 1000
    else:
        end = calendar.timegm(datetime.fromisoformat(end).timetuple()) * 1000

    last_time = None

    # Page through the API until the newest fetched candle reaches `end`.
    while len(df) == 0 or (last_time is not None and last_time < end):
        url = ('https://api.binance.com/api/v3/klines?symbol=' + symbol +
               '&interval=' + interval + '&limit=' + str(limit))  # honour `limit` (original hardcoded 1000)
        if len(df) == 0:
            url += '&startTime=' + str(start)
        else:
            # Resume one ms past the last seen candle; the original resumed AT
            # last_time, duplicating one row per page.
            url += '&startTime=' + str(last_time + 1)
        url += '&endTime=' + str(end)

        df2 = pd.read_json(url)
        if df2.empty:
            # No more candles in the requested window; the original would
            # crash on the column assignment below.
            break
        df2.columns = ['Opentime', 'Open', 'High', 'Low', 'Close', 'Volume',
                       'Closetime', 'Quote asset volume', 'Number of trades',
                       'Taker by base', 'Taker buy quote', 'Ignore']

        # Keep only the OHLCV columns plus the open time.
        dftmp = df2.drop(['Quote asset volume', 'Closetime', 'Number of trades',
                          'Taker by base', 'Taker buy quote', 'Ignore'], axis=1)
        dftmp.Opentime = pd.to_datetime(dftmp.Opentime, unit='ms')
        dftmp = dftmp.reindex(columns=["Opentime", "Open", "High", "Low", "Close", "Volume"])
        dftmp.reset_index(drop=True, inplace=True)

        # Epoch ms of the newest candle in this page; drives pagination.
        last_time = (dftmp['Opentime'][len(dftmp) - 1] - datetime(1970, 1, 1)) // timedelta(milliseconds=1)

        df = pd.concat([df, dftmp], axis=0, ignore_index=True, keys=None)

    df.to_csv(f'data/{symbol}--interval-{interval}--start-{start}--end-{end}.csv', index=False)
    return df
def get_stored_data(symbol, interval, start, end):
    """Load a previously downloaded kline CSV into a DataFrame.

    `start` and `end` are ISO dates ('YYYY-MM-DD', UTC); they are converted
    to the epoch-millisecond values embedded in the stored file name.
    """
    # NOTE(review): this reads from the absolute path /crypto_prediction/data/
    # while get_data_and_store_csv writes to the relative path data/ — confirm
    # both actually point at the same directory.
    start_ms = calendar.timegm(datetime.fromisoformat(start).timetuple()) * 1000
    end_ms = calendar.timegm(datetime.fromisoformat(end).timetuple()) * 1000
    path = f'/crypto_prediction/data/{symbol}--interval-{interval}--start-{start_ms}--end-{end_ms}.csv'
    return pd.read_csv(path, parse_dates=['Opentime'])
def process_data(df):
    """Turn raw kline rows into model features.

    Expands Opentime into calendar features, copies Close into the target
    column 'price', then drops the raw time/OHLCV columns. Mutates `df`
    in place and returns it.
    """
    opentime = df.Opentime.dt
    df["sale_year"] = opentime.year
    df["sale_month"] = opentime.month
    df["sale_day"] = opentime.day
    df["sale_day_of_week"] = opentime.dayofweek
    df["sale_day_of_year"] = opentime.dayofyear

    # The closing price is what the model predicts.
    df["price"] = df.Close

    # Drop the raw inputs, leaving only the engineered features + target.
    df.drop(columns=["Opentime", "Open", "High", "Low", "Close", "Volume"],
            inplace=True)
    return df
def split_data(df, split_value):
    """Chronological train/validation split.

    The first `split_value` fraction of rows becomes the training set and
    the remainder the validation set (no shuffling — the rows are a time
    series). Returns X_train, X_test, y_train, y_test, where y is the
    'price' column and X is everything else.
    """
    cut = int(len(df) * split_value)
    train, val = df[:cut], df[cut:]

    y_train = train["price"]
    X_train = train.drop("price", axis=1)
    y_test = val["price"]
    X_test = val.drop("price", axis=1)

    return X_train, X_test, y_train, y_test
def find_best_hyperparameters_and_train(X_train, y_train):
    """Grid-search RandomForestRegressor hyperparameters and fit the model.

    Uses the module-level `rf_reg_grid` as the search space. Returns the
    fitted GridSearchCV object, refit on the best parameter combination
    (so it can be used directly for scoring/prediction).
    """
    # `rf_reg_grid` is only read here, so no `global` declaration is needed.
    searcher = GridSearchCV(
        estimator=RandomForestRegressor(),
        param_grid=rf_reg_grid,
        cv=5,            # 5-fold cross-validation
        refit=True,      # refit on the whole training set with the best params
        n_jobs=-1,       # use every available core
    )
    searcher.fit(X_train, y_train)

    print("Best hyperparameters: ", searcher.best_params_)
    return searcher
def main():
    """Load stored kline data, train a grid-searched model, and score it."""
    # NOTE: the original declared `global models`, but no `models` name exists
    # anywhere in the file — dead statement, removed.

    # Read data into a pandas DataFrame (uncomment to re-download from Binance).
    #df = get_data_and_store_csv('XMRUSDT', '30m', '2022-09-01', '2022-09-04')
    df = get_stored_data('XMRUSDT', '30m', '2020-01-01', '2022-09-04')

    # Prepare data for modelling.
    df = process_data(df=df)

    # Split data into train and test sets (chronological, 95% train).
    X_train, X_test, y_train, y_test = split_data(df=df, split_value=0.95)

    # Model the data.
    model = find_best_hyperparameters_and_train(
        X_train=X_train,
        y_train=y_train
    )

    # Score the model on the held-out tail of the series.
    print('Model scored: ', model.score(X_test, y_test))
if __name__ == "__main__":
    # Time the full run and report it as hours / minutes / seconds.
    start_time = time.time()
    main()
    end_time = time.time() - start_time
    minutes, seconds = divmod(end_time, 60)
    hours, minutes = divmod(minutes, 60)
    print(f'\n\nCompleted...\t\t\t\t\t{hours} h {minutes} m {int(seconds)}s')