first commit

main
Gasper Spagnolo 2022-09-05 13:30:09 +02:00
commit 2091278767
4 changed files with 47094 additions and 0 deletions

13
Dockerfile Normal file

@@ -0,0 +1,13 @@
# Base image with conda preinstalled
FROM continuumio/miniconda:latest
WORKDIR /crypto_prediction
COPY . .
# Create the conda environment described in env.yml and auto-activate it
# for interactive shells
RUN conda env create -f env.yml
RUN echo 'source activate crypto_prediction' | tee -a ~/.bashrc
# Put the environment's interpreter first on PATH so "python" resolves to it
ENV PATH /opt/conda/envs/crypto_prediction/bin:$PATH
ENTRYPOINT ["python", "/crypto_prediction/main.py"]
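
A minimal sketch of building and running the image (standard Docker CLI; the image tag is an assumption). main.py writes its CSV cache to /crypto_prediction/data, so mounting a host directory there keeps the data between runs:

docker build -t crypto_prediction .
docker run --rm -v "$PWD/data:/crypto_prediction/data" crypto_prediction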

File diff suppressed because it is too large

58
env.yml Normal file

@@ -0,0 +1,58 @@
name: crypto_prediction
channels:
- defaults
dependencies:
- _libgcc_mutex=0.1=main
- _openmp_mutex=5.1=1_gnu
- beautifulsoup4=4.11.1=py310h06a4308_0
- blas=1.0=mkl
- bottleneck=1.3.5=py310ha9d4c09_0
- bzip2=1.0.8=h7b6447c_0
- ca-certificates=2022.07.19=h06a4308_0
- certifi=2022.6.15=py310h06a4308_0
- fftw=3.3.9=h27cfd23_1
- intel-openmp=2021.4.0=h06a4308_3561
- joblib=1.1.0=pyhd3eb1b0_0
- ld_impl_linux-64=2.38=h1181459_1
- libffi=3.3=he6710b0_2
- libgcc-ng=11.2.0=h1234567_1
- libgfortran-ng=11.2.0=h00389a5_1
- libgfortran5=11.2.0=h1234567_1
- libgomp=11.2.0=h1234567_1
- libstdcxx-ng=11.2.0=h1234567_1
- libuuid=1.0.3=h7f8727e_2
- mkl=2021.4.0=h06a4308_640
- mkl-service=2.4.0=py310h7f8727e_0
- mkl_fft=1.3.1=py310hd6ae3a3_0
- mkl_random=1.2.2=py310h00e6091_0
- ncurses=6.3=h5eee18b_3
- numexpr=2.8.3=py310hcea2de6_0
- numpy=1.21.5=py310h1794996_3
- numpy-base=1.21.5=py310hcba007f_3
- openssl=1.1.1q=h7f8727e_0
- packaging=21.3=pyhd3eb1b0_0
- pandas=1.4.3=py310h6a678d5_0
- pip=22.1.2=py310h06a4308_0
- pyparsing=3.0.9=py310h06a4308_0
- python=3.10.4=h12debd9_0
- python-dateutil=2.8.2=pyhd3eb1b0_0
- pytz=2022.1=py310h06a4308_0
- readline=8.1.2=h7f8727e_1
- scikit-learn=1.1.1=py310h6a678d5_0
- scipy=1.7.3=py310h1794996_2
- setuptools=63.4.1=py310h06a4308_0
- six=1.16.0=pyhd3eb1b0_1
- soupsieve=2.3.1=pyhd3eb1b0_0
- sqlite=3.39.2=h5082296_0
- threadpoolctl=2.2.0=pyh0d69192_0
- tk=8.6.12=h1ccaba5_0
- tzdata=2022a=hda174b7_0
- wheel=0.37.1=pyhd3eb1b0_0
- xz=5.2.5=h7f8727e_1
- zlib=1.2.12=h7f8727e_2
- pip:
  - charset-normalizer==2.1.1
  - idna==3.3
  - requests==2.28.1
  - urllib3==1.26.12
prefix: /home/gasperspagnolo/.conda/envs/tf-venv
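
Outside Docker, the same environment can be recreated from this file; the machine-specific prefix: line above is conda export residue and is ignored when the environment is created from the name: field:

conda env create -f env.yml
conda activate crypto_prediction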

151
main.py Normal file

@@ -0,0 +1,151 @@
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import time
from datetime import datetime, timezone, timedelta
import calendar
# Hyperparameter grid explored by GridSearchCV
rf_reg_grid = {
    'bootstrap': [True, False],
    'max_depth': [5, 10, 70, 90, 100, None],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [1, 2, 4, 6],
    'min_samples_split': [2, 5, 10, 12],
    'n_estimators': [100, 200]
}
# Best combination found by a previous grid search run
best_params = {
    'bootstrap': [True],
    'max_depth': [90],
    'max_features': ['auto'],
    'min_samples_leaf': [4],
    'min_samples_split': [10],
    'n_estimators': [100]
}
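# The full grid enumerates 2 * 6 * 2 * 4 * 4 * 2 = 768 candidates; with cv=5
# that means 3840 model fits per search, which is why the winning combination
# is cached in best_params above.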
def get_data_and_store_csv(symbol, interval, start, end=None, limit=1000):
    """
    start and end must be ISO-format dates (YYYY-MM-DD), interpreted as UTC.
    Binance returns at most 1000 records per API call, so the requested
    range is fetched page by page and concatenated.
    """
    df = pd.DataFrame()
    if start is None:
        raise ValueError('start time must not be None')
    start = calendar.timegm(datetime.fromisoformat(start).timetuple()) * 1000
    if end is None:
        # Default to "now" in UTC, expressed in milliseconds
        end = int(datetime.now(timezone.utc).timestamp()) * 1000
    else:
        end = calendar.timegm(datetime.fromisoformat(end).timetuple()) * 1000
    last_time = None
    while len(df) == 0 or (last_time is not None and last_time < end):
        url = 'https://api.binance.com/api/v3/klines?symbol=' + \
            symbol + '&interval=' + interval + '&limit=' + str(limit)
        if len(df) == 0:
            url += '&startTime=' + str(start)
        else:
            url += '&startTime=' + str(last_time)
        url += '&endTime=' + str(end)
        df2 = pd.read_json(url)
        df2.columns = ['Opentime', 'Open', 'High', 'Low', 'Close', 'Volume', 'Closetime',
                       'Quote asset volume', 'Number of trades', 'Taker buy base', 'Taker buy quote', 'Ignore']
        dftmp = df2.copy()
        dftmp.Opentime = pd.to_datetime(dftmp.Opentime, unit='ms')
        dftmp = dftmp.drop(['Quote asset volume', 'Closetime', 'Number of trades',
                            'Taker buy base', 'Taker buy quote', 'Ignore'], axis=1)
        dftmp.reset_index(drop=True, inplace=True)
        # Millisecond timestamp of the newest candle fetched so far, used to
        # page the next request
        last_time = (dftmp['Opentime'][len(dftmp) - 1] - datetime(1970, 1, 1)) // timedelta(milliseconds=1)
        df = pd.concat([df, dftmp], axis=0, ignore_index=True)
    df.to_csv(f'data/{symbol}--interval-{interval}--start-{start}--end-{end}.csv', index=False)
    return df
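# A minimal usage sketch (assumes network access to api.binance.com and an
# existing data/ directory next to main.py):
#   get_data_and_store_csv('XMRUSDT', '30m', '2022-09-01', end='2022-09-04')
# Binance timestamps are UTC milliseconds, e.g. for 2022-09-01:
#   calendar.timegm(datetime.fromisoformat('2022-09-01').timetuple()) * 1000
#   -> 1661990400000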
def get_stored_data(symbol, interval, start, end):
    start = calendar.timegm(datetime.fromisoformat(start).timetuple()) * 1000
    end = calendar.timegm(datetime.fromisoformat(end).timetuple()) * 1000
    df = pd.read_csv(
        f'/crypto_prediction/data/{symbol}--interval-{interval}--start-{start}--end-{end}.csv',
        parse_dates=['Opentime']
    )
    return df
def process_data(df):
    # Expand the candle open time into calendar features and keep the close
    # price as the regression target
    df["sale_year"] = df.Opentime.dt.year
    df["sale_month"] = df.Opentime.dt.month
    df["sale_day"] = df.Opentime.dt.day
    df["sale_day_of_week"] = df.Opentime.dt.dayofweek
    df["sale_day_of_year"] = df.Opentime.dt.dayofyear
    df["price"] = df.Close
    df.drop(["Opentime", "Open", "High", "Low", "Close", "Volume"], axis=1, inplace=True)
    return df
def split_data(df, split_value):
    df_split = int(len(df) * split_value)
    df_train = df[:df_split]
    df_val = df[df_split:]
    X_train, y_train = df_train.drop("price", axis=1), df_train["price"]
    X_test, y_test = df_val.drop("price", axis=1), df_val["price"]
    return X_train, X_test, y_train, y_test
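# Note: split_data performs a chronological split (no shuffling), so the
# model is validated on the most recent (1 - split_value) share of candles
# rather than on a random sample.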
def find_best_hyperparameters_and_train(X_train, y_train):
    # Exhaustive 5-fold cross-validated search over rf_reg_grid; refit=True
    # retrains the best candidate on the full training set
    gs_rf_reg = GridSearchCV(
        RandomForestRegressor(),
        param_grid=rf_reg_grid,
        cv=5,
        refit=True,
        n_jobs=-1
    )
    gs_rf_reg.fit(X_train, y_train)
    print("Best hyperparameters: ", gs_rf_reg.best_params_)
    return gs_rf_reg
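# A minimal sketch for reusing the cached best_params instead of re-running
# the search (hypothetical shortcut, not wired into main() below):
#   model = RandomForestRegressor(**{k: v[0] for k, v in best_params.items()})
#   model.fit(X_train, y_train)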
def main():
    # Read data into a pandas DataFrame
    #df = get_data_and_store_csv('XMRUSDT', '30m', '2022-09-01', '2022-09-04')
    df = get_stored_data('XMRUSDT', '30m', '2020-01-01', '2022-09-04')
    # Prepare data for modeling
    df = process_data(df=df)
    # Split data into train and test sets
    X_train, X_test, y_train, y_test = split_data(df=df, split_value=0.82)
    # Model the data
    model = find_best_hyperparameters_and_train(
        X_train=X_train,
        y_train=y_train
    )
    # Score our model
    print('Model scored: ', model.score(X_test, y_test))
if __name__ == "__main__":
    start_time = time.time()
    main()
    end_time = time.time() - start_time
    print(f'\n\nCompleted...\t\t\t\t\t{int(end_time // 3600)} h {int((end_time // 60) % 60)} m {int(end_time % 60)} s')