commit 2091278767
first commit
Dockerfile
@@ -0,0 +1,13 @@
FROM continuumio/miniconda:latest

WORKDIR /crypto_prediction

COPY . .

RUN conda env create -f env.yml

RUN echo 'source activate crypto_prediction' | tee -a ~/.bashrc

ENV PATH /opt/conda/envs/crypto_prediction/bin:$PATH

ENTRYPOINT ["python", "/crypto_prediction/main.py"]
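# Typical usage, a hedged sketch (the image tag is assumed, not part of the commit):
#   docker build -t crypto_prediction .
#   docker run --rm crypto_prediction
# Note that the 'source activate' line only affects interactive bash shells; it is
# the ENV PATH line that puts the crypto_prediction env's python first on PATH, so
# the ENTRYPOINT runs inside the conda environment.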
File diff suppressed because it is too large
env.yml
@@ -0,0 +1,58 @@
name: crypto_prediction
channels:
  - defaults
dependencies:
  - _libgcc_mutex=0.1=main
  - _openmp_mutex=5.1=1_gnu
  - beautifulsoup4=4.11.1=py310h06a4308_0
  - blas=1.0=mkl
  - bottleneck=1.3.5=py310ha9d4c09_0
  - bzip2=1.0.8=h7b6447c_0
  - ca-certificates=2022.07.19=h06a4308_0
  - certifi=2022.6.15=py310h06a4308_0
  - fftw=3.3.9=h27cfd23_1
  - intel-openmp=2021.4.0=h06a4308_3561
  - joblib=1.1.0=pyhd3eb1b0_0
  - ld_impl_linux-64=2.38=h1181459_1
  - libffi=3.3=he6710b0_2
  - libgcc-ng=11.2.0=h1234567_1
  - libgfortran-ng=11.2.0=h00389a5_1
  - libgfortran5=11.2.0=h1234567_1
  - libgomp=11.2.0=h1234567_1
  - libstdcxx-ng=11.2.0=h1234567_1
  - libuuid=1.0.3=h7f8727e_2
  - mkl=2021.4.0=h06a4308_640
  - mkl-service=2.4.0=py310h7f8727e_0
  - mkl_fft=1.3.1=py310hd6ae3a3_0
  - mkl_random=1.2.2=py310h00e6091_0
  - ncurses=6.3=h5eee18b_3
  - numexpr=2.8.3=py310hcea2de6_0
  - numpy=1.21.5=py310h1794996_3
  - numpy-base=1.21.5=py310hcba007f_3
  - openssl=1.1.1q=h7f8727e_0
  - packaging=21.3=pyhd3eb1b0_0
  - pandas=1.4.3=py310h6a678d5_0
  - pip=22.1.2=py310h06a4308_0
  - pyparsing=3.0.9=py310h06a4308_0
  - python=3.10.4=h12debd9_0
  - python-dateutil=2.8.2=pyhd3eb1b0_0
  - pytz=2022.1=py310h06a4308_0
  - readline=8.1.2=h7f8727e_1
  - scikit-learn=1.1.1=py310h6a678d5_0
  - scipy=1.7.3=py310h1794996_2
  - setuptools=63.4.1=py310h06a4308_0
  - six=1.16.0=pyhd3eb1b0_1
  - soupsieve=2.3.1=pyhd3eb1b0_0
  - sqlite=3.39.2=h5082296_0
  - threadpoolctl=2.2.0=pyh0d69192_0
  - tk=8.6.12=h1ccaba5_0
  - tzdata=2022a=hda174b7_0
  - wheel=0.37.1=pyhd3eb1b0_0
  - xz=5.2.5=h7f8727e_1
  - zlib=1.2.12=h7f8727e_2
  - pip:
    - charset-normalizer==2.1.1
    - idna==3.3
    - requests==2.28.1
    - urllib3==1.26.12
prefix: /home/gasperspagnolo/.conda/envs/tf-venv
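# Note (not in the original file): the prefix above is export metadata from the
# author's machine; as documented, 'conda env create -f env.yml' names the env
# from the 'name:' field, so this file still builds cleanly inside the image.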

main.py
@@ -0,0 +1,151 @@
import calendar
import time
from datetime import datetime, timezone, timedelta

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Hyperparameter search space for the random forest regressor.
rf_reg_grid = {
    'bootstrap': [True, False],
    'max_depth': [5, 10, 70, 90, 100, None],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [1, 2, 4, 6],
    'min_samples_split': [2, 5, 10, 12],
    'n_estimators': [100, 200]
}
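# Sizing note (derived from the grid above): 2 * 6 * 2 * 4 * 4 * 2 = 768 parameter
# combinations; with cv=5 in GridSearchCV below, that is 768 * 5 = 3840 model fits.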

# A previously found parameter set (each value wrapped in a single-element list).
best_params = {
    'bootstrap': [True],
    'max_depth': [90],
    'max_features': ['auto'],
    'min_samples_leaf': [4],
    'min_samples_split': [10],
    'n_estimators': [100]
}
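# best_params is defined but never used below; a hedged sketch of how it could
# skip the grid search (hypothetical, not part of the original commit):
#   model = RandomForestRegressor(**{k: v[0] for k, v in best_params.items()})
#   model.fit(X_train, y_train)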


def get_data_and_store_csv(symbol, interval, start, end=None, limit=1000):
    """
    Fetch klines from the Binance API and store them as a CSV.

    start and end must be ISO-format dates (YYYY-MM-DD), interpreted in UTC.
    Binance returns at most 1000 records per API call, so the data is fetched
    in pages until the end timestamp is reached.
    """

    df = pd.DataFrame()

    if start is None:
        print('start time must not be None')
        exit(1)
    start = calendar.timegm(datetime.fromisoformat(start).timetuple()) * 1000

    if end is None:
        # Default to the current UTC time, in milliseconds since the epoch.
        dt = datetime.now(timezone.utc)
        utc_time = dt.replace(tzinfo=timezone.utc)
        end = int(utc_time.timestamp()) * 1000
    else:
        end = calendar.timegm(datetime.fromisoformat(end).timetuple()) * 1000
    last_time = None

    while len(df) == 0 or (last_time is not None and last_time < end):
        url = 'https://api.binance.com/api/v3/klines?symbol=' + \
            symbol + '&interval=' + interval + '&limit=' + str(limit)
        if len(df) == 0:
            url += '&startTime=' + str(start)
        else:
            url += '&startTime=' + str(last_time)
        url += '&endTime=' + str(end)

        df2 = pd.read_json(url)
        df2.columns = ['Opentime', 'Open', 'High', 'Low', 'Close', 'Volume', 'Closetime',
                       'Quote asset volume', 'Number of trades', 'Taker buy base', 'Taker buy quote', 'Ignore']

        dftmp = pd.DataFrame()
        dftmp = pd.concat([df2, dftmp], axis=0, ignore_index=True, keys=None)
        dftmp.Opentime = pd.to_datetime(dftmp.Opentime, unit='ms')
        dftmp = dftmp.drop(['Quote asset volume', 'Closetime',
                            'Number of trades', 'Taker buy base', 'Taker buy quote', 'Ignore'], axis=1)
        column_names = ["Opentime", "Open", "High", "Low", "Close", "Volume"]

        dftmp.reset_index(drop=True, inplace=True)
        dftmp = dftmp.reindex(columns=column_names)

        # Timestamp (ms since epoch) of the last row fetched, used as the next
        # page's startTime; note consecutive pages overlap by that one candle.
        last_time = (dftmp['Opentime'][len(dftmp) - 1] - datetime(1970, 1, 1)) // timedelta(milliseconds=1)

        df = pd.concat([df, dftmp], axis=0, ignore_index=True, keys=None)

    df.to_csv(f'data/{symbol}--interval-{interval}--start-{start}--end-{end}.csv', index=False)
    return df
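
# Usage sketch, taken from the commented-out call in main() below:
#   df = get_data_and_store_csv('XMRUSDT', '30m', '2022-09-01', '2022-09-04')
# writes data/XMRUSDT--interval-30m--start-...--end-....csv and returns the DataFrame.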


def get_stored_data(symbol, interval, start, end):
    # Convert the ISO dates to the millisecond timestamps used in the CSV filename.
    start = calendar.timegm(datetime.fromisoformat(start).timetuple()) * 1000
    end = calendar.timegm(datetime.fromisoformat(end).timetuple()) * 1000
    df = pd.read_csv(
        f'/crypto_prediction/data/{symbol}--interval-{interval}--start-{start}--end-{end}.csv',
        parse_dates=['Opentime']
    )
    return df


def process_data(df):
    # Derive calendar features from the candle open time; the close price is the target.
    df["sale_year"] = df.Opentime.dt.year
    df["sale_month"] = df.Opentime.dt.month
    df["sale_day"] = df.Opentime.dt.day
    df["sale_day_of_week"] = df.Opentime.dt.dayofweek
    df["sale_day_of_year"] = df.Opentime.dt.dayofyear
    df["price"] = df.Close

    df.drop(["Opentime", "Open", "High", "Low", "Close", "Volume"], axis=1, inplace=True)

    return df


def split_data(df, split_value):
    # Chronological split: the first split_value fraction trains, the rest validates.
    df_split = int(len(df) * split_value)

    df_train = df[:df_split]
    df_val = df[df_split:]

    X_train, y_train = df_train.drop("price", axis=1), df_train["price"]
    X_test, y_test = df_val.drop("price", axis=1), df_val["price"]

    return X_train, X_test, y_train, y_test
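
# Because the rows are in time order and the split is not shuffled, the model is
# validated on the most recent ~18% of candles (split_value=0.82 in main() below),
# which avoids look-ahead leakage into the training set.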


def find_best_hyperparameters_and_train(X_train, y_train):
    global rf_reg_grid

    # Exhaustive search over rf_reg_grid with 5-fold cross-validation,
    # refitting the best estimator on the full training set.
    gs_rf_reg = GridSearchCV(
        RandomForestRegressor(),
        param_grid=rf_reg_grid,
        cv=5,
        refit=True,
        n_jobs=-1
    )

    gs_rf_reg.fit(X_train, y_train)

    print("Best hyperparameters: ", gs_rf_reg.best_params_)

    return gs_rf_reg


def main():
    # Read data into a pandas DataFrame
    # df = get_data_and_store_csv('XMRUSDT', '30m', '2022-09-01', '2022-09-04')
    df = get_stored_data('XMRUSDT', '30m', '2020-01-01', '2022-09-04')
    # Prepare data for modeling
    df = process_data(df=df)
    # Split data into train and test sets
    X_train, X_test, y_train, y_test = split_data(df=df, split_value=0.82)

    # Model the data
    model = find_best_hyperparameters_and_train(
        X_train=X_train,
        y_train=y_train
    )

    # Score our model (R^2 on the held-out test set)
    print('Model scored: ', model.score(X_test, y_test))


if __name__ == "__main__":
    start_time = time.time()
    main()
    end_time = time.time() - start_time
    print(f'\n\nCompleted...\t\t\t\t\t{int(end_time // 3600)} h {int((end_time // 60) % 60)} m {int(end_time % 60)} s')