first commit
commit
2091278767
|
@ -0,0 +1,13 @@
|
||||||
|
# syntax=docker/dockerfile:1
# NOTE(review): pin a specific miniconda tag/digest instead of :latest for
# reproducible builds — confirm which tag matches the env.yml (Python 3.10).
FROM continuumio/miniconda:latest

WORKDIR /crypto_prediction

# Copy only the environment spec first so the (slow) conda env layer stays
# cached until env.yml itself changes, instead of being rebuilt on every
# source edit.
COPY env.yml .
RUN conda env create -f env.yml

# Auto-activate the environment for interactive shells.
RUN echo 'source activate crypto_prediction' | tee -a ~/.bashrc

# Make the environment's binaries the default for non-interactive commands
# (key=value form; the legacy space-separated ENV form is deprecated).
ENV PATH=/opt/conda/envs/crypto_prediction/bin:$PATH

# Application code last: source changes no longer invalidate the conda layer.
COPY . .

ENTRYPOINT ["python", "/crypto_prediction/main.py"]
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,58 @@
|
||||||
|
# Conda environment for the crypto_prediction container (Python 3.10,
# pandas + scikit-learn stack; pip section for pure-HTTP deps).
name: crypto_prediction
channels:
  - defaults
dependencies:
  - _libgcc_mutex=0.1=main
  - _openmp_mutex=5.1=1_gnu
  - beautifulsoup4=4.11.1=py310h06a4308_0
  - blas=1.0=mkl
  - bottleneck=1.3.5=py310ha9d4c09_0
  - bzip2=1.0.8=h7b6447c_0
  - ca-certificates=2022.07.19=h06a4308_0
  - certifi=2022.6.15=py310h06a4308_0
  - fftw=3.3.9=h27cfd23_1
  - intel-openmp=2021.4.0=h06a4308_3561
  - joblib=1.1.0=pyhd3eb1b0_0
  - ld_impl_linux-64=2.38=h1181459_1
  - libffi=3.3=he6710b0_2
  - libgcc-ng=11.2.0=h1234567_1
  - libgfortran-ng=11.2.0=h00389a5_1
  - libgfortran5=11.2.0=h1234567_1
  - libgomp=11.2.0=h1234567_1
  - libstdcxx-ng=11.2.0=h1234567_1
  - libuuid=1.0.3=h7f8727e_2
  - mkl=2021.4.0=h06a4308_640
  - mkl-service=2.4.0=py310h7f8727e_0
  - mkl_fft=1.3.1=py310hd6ae3a3_0
  - mkl_random=1.2.2=py310h00e6091_0
  - ncurses=6.3=h5eee18b_3
  - numexpr=2.8.3=py310hcea2de6_0
  - numpy=1.21.5=py310h1794996_3
  - numpy-base=1.21.5=py310hcba007f_3
  - openssl=1.1.1q=h7f8727e_0
  - packaging=21.3=pyhd3eb1b0_0
  - pandas=1.4.3=py310h6a678d5_0
  - pip=22.1.2=py310h06a4308_0
  - pyparsing=3.0.9=py310h06a4308_0
  - python=3.10.4=h12debd9_0
  - python-dateutil=2.8.2=pyhd3eb1b0_0
  - pytz=2022.1=py310h06a4308_0
  - readline=8.1.2=h7f8727e_1
  - scikit-learn=1.1.1=py310h6a678d5_0
  - scipy=1.7.3=py310h1794996_2
  - setuptools=63.4.1=py310h06a4308_0
  - six=1.16.0=pyhd3eb1b0_1
  - soupsieve=2.3.1=pyhd3eb1b0_0
  - sqlite=3.39.2=h5082296_0
  - threadpoolctl=2.2.0=pyh0d69192_0
  - tk=8.6.12=h1ccaba5_0
  - tzdata=2022a=hda174b7_0
  - wheel=0.37.1=pyhd3eb1b0_0
  - xz=5.2.5=h7f8727e_1
  - zlib=1.2.12=h7f8727e_2
  - pip:
    - charset-normalizer==2.1.1
    - idna==3.3
    - requests==2.28.1
    - urllib3==1.26.12
# NOTE(review): removed the exported machine-specific line
#   prefix: /home/gasperspagnolo/.conda/envs/tf-venv
# It pointed at another user's home directory and a differently named env;
# `conda env create -f env.yml` uses the `name:` field above instead.
|
|
@ -0,0 +1,151 @@
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.model_selection import GridSearchCV
|
||||||
|
from sklearn.ensemble import RandomForestRegressor
|
||||||
|
import time
|
||||||
|
from datetime import datetime, timezone, timedelta
|
||||||
|
import calendar
|
||||||
|
|
||||||
|
# Hyperparameter search space explored by GridSearchCV for the
# RandomForestRegressor (see find_best_hyperparameters_and_train).
rf_reg_grid = {
    'bootstrap': [True, False],
    'max_depth': [5, 10, 70, 90, 100, None],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [1, 2, 4, 6],
    'min_samples_split': [2, 5, 10, 12],
    'n_estimators': [100, 200],
}

# Best combination found by a previous grid-search run, kept around so the
# full search can be skipped by passing this grid instead.
best_params = {
    'bootstrap': [True],
    'max_depth': [90],
    'max_features': ['auto'],
    'min_samples_leaf': [4],
    'min_samples_split': [10],
    'n_estimators': [100],
}
|
||||||
|
def get_data_and_store_csv(symbol, interval, start, end=None, limit=1000):
    """
    Download kline (candlestick) data from the Binance REST API and store it
    as a CSV under data/.

    start and end must be isoformat YYYY-MM-DD, interpreted as UTC.
    When end is None, data is fetched up to the current UTC time.
    Binance returns at most `limit` (API max 1000) records per call, so the
    range is paged in a loop until `end` is reached.

    Returns the combined DataFrame with columns
    Opentime, Open, High, Low, Close, Volume.

    Raises ValueError if start is None.
    """
    if start is None:
        # Was print(...) + exit(0): that reported *success* to the OS on a
        # hard error and killed the host process. Raise instead.
        raise ValueError('start time must not be None')

    df = pd.DataFrame()
    # Convert ISO date to epoch milliseconds (UTC), as Binance expects.
    start = calendar.timegm(datetime.fromisoformat(start).timetuple()) * 1000

    if end is None:
        # Default the end of the range to "now" (UTC) in epoch ms.
        # Bug fix: a stray `return` here used to abort the download right
        # after computing the default end time.
        dt = datetime.now(timezone.utc)
        utc_time = dt.replace(tzinfo=timezone.utc)
        end = int(utc_time.timestamp()) * 1000
    else:
        end = calendar.timegm(datetime.fromisoformat(end).timetuple()) * 1000

    last_time = None

    while len(df) == 0 or (last_time is not None and last_time < end):
        # Page through the API, resuming from the last open time fetched.
        # `limit` was previously hardcoded to 1000 despite being a parameter.
        url = ('https://api.binance.com/api/v3/klines?symbol=' + symbol +
               '&interval=' + interval + '&limit=' + str(limit))
        if len(df) == 0:
            url += '&startTime=' + str(start)
        else:
            url += '&startTime=' + str(last_time)
        url += '&endTime=' + str(end)

        df2 = pd.read_json(url)
        df2.columns = ['Opentime', 'Open', 'High', 'Low', 'Close', 'Volume', 'Closetime',
                       'Quote asset volume', 'Number of trades', 'Taker by base', 'Taker buy quote', 'Ignore']

        dftmp = pd.concat([df2, pd.DataFrame()], axis=0, ignore_index=True, keys=None)
        dftmp.Opentime = pd.to_datetime(dftmp.Opentime, unit='ms')
        # Keep only the OHLCV columns the model uses.
        dftmp = dftmp.drop(['Quote asset volume', 'Closetime',
                            'Number of trades', 'Taker by base', 'Taker buy quote', 'Ignore'], axis=1)
        column_names = ["Opentime", "Open", "High", "Low", "Close", "Volume"]
        dftmp.reset_index(drop=True, inplace=True)
        dftmp = dftmp.reindex(columns=column_names)

        # Epoch ms of the newest row fetched: next page resumes from here.
        last_time = (dftmp['Opentime'][len(dftmp) - 1] - datetime(1970, 1, 1)) // timedelta(milliseconds=1)

        df = pd.concat([df, dftmp], axis=0, ignore_index=True, keys=None)

    df.to_csv(f'data/{symbol}--interval-{interval}--start-{start}--end-{end}.csv', index=False)
    return df
|
||||||
|
|
||||||
|
def get_stored_data(symbol, interval, start, end):
    """
    Load a previously downloaded kline CSV (see get_data_and_store_csv).

    start and end are ISO-format dates (YYYY-MM-DD); they are converted to
    the same epoch-millisecond values used in the stored filename.
    Returns a DataFrame with Opentime parsed as datetimes.
    """
    # NOTE(review): path is absolute (/crypto_prediction/...) — matches the
    # container WORKDIR, but will not resolve outside Docker; confirm.
    start_ms = calendar.timegm(datetime.fromisoformat(start).timetuple()) * 1000
    end_ms = calendar.timegm(datetime.fromisoformat(end).timetuple()) * 1000
    path = f'/crypto_prediction/data/{symbol}--interval-{interval}--start-{start_ms}--end-{end_ms}.csv'
    return pd.read_csv(path, parse_dates=['Opentime'])
|
||||||
|
|
||||||
|
def process_data(df):
    """
    Feature-engineer a kline DataFrame in place for modelling.

    Expands Opentime into calendar features (year, month, day, day-of-week,
    day-of-year), copies Close into the target column 'price', then drops
    the raw OHLCV columns. Mutates and returns the same DataFrame.
    """
    opentime = df.Opentime
    df["sale_year"] = opentime.dt.year
    df["sale_month"] = opentime.dt.month
    df["sale_day"] = opentime.dt.day
    df["sale_day_of_week"] = opentime.dt.dayofweek
    df["sale_day_of_year"] = opentime.dt.dayofyear
    df["price"] = df.Close

    # Raw columns are no longer needed once the calendar features and the
    # target exist; drop them in the original order.
    for column in ("Opentime", "Open", "High", "Low", "Close", "Volume"):
        df.drop(column, axis=1, inplace=True)

    return df
|
||||||
|
|
||||||
|
def split_data(df, split_value):
    """
    Chronological train/validation split (no shuffling).

    The first int(len(df) * split_value) rows become the training set, the
    remainder the validation set. 'price' is separated out as the target.

    Returns (X_train, X_test, y_train, y_test).
    """
    cutoff = int(len(df) * split_value)

    train = df[:cutoff]
    val = df[cutoff:]

    X_train = train.drop("price", axis=1)
    y_train = train["price"]
    X_test = val.drop("price", axis=1)
    y_test = val["price"]

    return X_train, X_test, y_train, y_test
|
||||||
|
|
||||||
|
def find_best_hyperparameters_and_train(X_train, y_train):
    """
    Grid-search a RandomForestRegressor over the module-level rf_reg_grid
    using 5-fold CV, refit on the full training data, and return the fitted
    GridSearchCV (callers use .score / .predict on it).

    Prints the best hyperparameter combination found.
    """
    # rf_reg_grid is only *read* here; the previous `global rf_reg_grid`
    # declaration was unnecessary (global is only needed for assignment).
    gs_rf_reg = GridSearchCV(
        RandomForestRegressor(),
        param_grid=rf_reg_grid,
        cv=5,
        refit=True,
        n_jobs=-1  # use all cores; the grid is large
    )

    gs_rf_reg.fit(X_train, y_train)

    print("Best hyperparameters: ", gs_rf_reg.best_params_)

    return gs_rf_reg
|
||||||
|
|
||||||
|
def main():
    """
    Pipeline entry point: load stored XMRUSDT 30m klines, engineer calendar
    features, split chronologically, grid-search a random forest, and print
    the validation score.
    """
    # (Removed `global models`: no `models` name is defined anywhere in
    # this module, so the declaration was dead and misleading.)

    # Read data into a pandas dataframe
    # df = get_data_and_store_csv('XMRUSDT', '30m', '2022-09-01', '2022-09-04')
    df = get_stored_data('XMRUSDT', '30m', '2020-01-01', '2022-09-04')

    # Prepare data for modelling
    df = process_data(df=df)

    # Split data into train and test sets (chronological 82/18 split)
    X_train, X_test, y_train, y_test = split_data(df=df, split_value=0.82)

    # Model the data
    model = find_best_hyperparameters_and_train(
        X_train=X_train,
        y_train=y_train
    )

    # Score our model on the held-out validation set
    print('Model scored: ', model.score(X_test, y_test))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    start_time = time.time()
    main()
    # Report wall-clock runtime as whole h/m/s. The previous f-string used
    # float division results directly and printed e.g. "0.0 h 0.0 m 3s".
    elapsed = int(time.time() - start_time)
    hours, remainder = divmod(elapsed, 3600)
    minutes, seconds = divmod(remainder, 60)
    print(f'\n\nCompleted...\t\t\t\t\t{hours} h {minutes} m {seconds}s')
|
Loading…
Reference in New Issue