commit 2091278767
first commit
Dockerfile
@@ -0,0 +1,13 @@
FROM continuumio/miniconda:latest

WORKDIR /crypto_prediction

COPY . .

RUN conda env create -f env.yml

RUN echo 'source activate crypto_prediction' | tee -a ~/.bashrc

ENV PATH /opt/conda/envs/crypto_prediction/bin:$PATH

ENTRYPOINT ["python", "/crypto_prediction/main.py"]
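# Typical usage, a hedged sketch (the image tag is assumed, not part of the commit):
#   docker build -t crypto_prediction .
#   docker run --rm crypto_prediction
# Note that the 'source activate' line only affects interactive bash shells; it is
# the ENV PATH line that puts the crypto_prediction env's python first on PATH, so
# the ENTRYPOINT runs inside the conda environment.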
File diff suppressed because it is too large
env.yml
@@ -0,0 +1,58 @@
name: crypto_prediction
channels:
  - defaults
dependencies:
  - _libgcc_mutex=0.1=main
  - _openmp_mutex=5.1=1_gnu
  - beautifulsoup4=4.11.1=py310h06a4308_0
  - blas=1.0=mkl
  - bottleneck=1.3.5=py310ha9d4c09_0
  - bzip2=1.0.8=h7b6447c_0
  - ca-certificates=2022.07.19=h06a4308_0
  - certifi=2022.6.15=py310h06a4308_0
  - fftw=3.3.9=h27cfd23_1
  - intel-openmp=2021.4.0=h06a4308_3561
  - joblib=1.1.0=pyhd3eb1b0_0
  - ld_impl_linux-64=2.38=h1181459_1
  - libffi=3.3=he6710b0_2
  - libgcc-ng=11.2.0=h1234567_1
  - libgfortran-ng=11.2.0=h00389a5_1
  - libgfortran5=11.2.0=h1234567_1
  - libgomp=11.2.0=h1234567_1
  - libstdcxx-ng=11.2.0=h1234567_1
  - libuuid=1.0.3=h7f8727e_2
  - mkl=2021.4.0=h06a4308_640
  - mkl-service=2.4.0=py310h7f8727e_0
  - mkl_fft=1.3.1=py310hd6ae3a3_0
  - mkl_random=1.2.2=py310h00e6091_0
  - ncurses=6.3=h5eee18b_3
  - numexpr=2.8.3=py310hcea2de6_0
  - numpy=1.21.5=py310h1794996_3
  - numpy-base=1.21.5=py310hcba007f_3
  - openssl=1.1.1q=h7f8727e_0
  - packaging=21.3=pyhd3eb1b0_0
  - pandas=1.4.3=py310h6a678d5_0
  - pip=22.1.2=py310h06a4308_0
  - pyparsing=3.0.9=py310h06a4308_0
  - python=3.10.4=h12debd9_0
  - python-dateutil=2.8.2=pyhd3eb1b0_0
  - pytz=2022.1=py310h06a4308_0
  - readline=8.1.2=h7f8727e_1
  - scikit-learn=1.1.1=py310h6a678d5_0
  - scipy=1.7.3=py310h1794996_2
  - setuptools=63.4.1=py310h06a4308_0
  - six=1.16.0=pyhd3eb1b0_1
  - soupsieve=2.3.1=pyhd3eb1b0_0
  - sqlite=3.39.2=h5082296_0
  - threadpoolctl=2.2.0=pyh0d69192_0
  - tk=8.6.12=h1ccaba5_0
  - tzdata=2022a=hda174b7_0
  - wheel=0.37.1=pyhd3eb1b0_0
  - xz=5.2.5=h7f8727e_1
  - zlib=1.2.12=h7f8727e_2
  - pip:
    - charset-normalizer==2.1.1
    - idna==3.3
    - requests==2.28.1
    - urllib3==1.26.12
prefix: /home/gasperspagnolo/.conda/envs/tf-venv
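# Note (not in the original file): the prefix above is export metadata from the
# author's machine; as documented, 'conda env create -f env.yml' names the env
# from the 'name:' field, so this file still builds cleanly inside the image.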

main.py
@@ -0,0 +1,151 @@
import calendar
import time
from datetime import datetime, timezone, timedelta

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Hyperparameter search space for the random forest regressor.
rf_reg_grid = {
    'bootstrap': [True, False],
    'max_depth': [5, 10, 70, 90, 100, None],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [1, 2, 4, 6],
    'min_samples_split': [2, 5, 10, 12],
    'n_estimators': [100, 200]
}
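# Sizing note (derived from the grid above): 2 * 6 * 2 * 4 * 4 * 2 = 768 parameter
# combinations; with cv=5 in GridSearchCV below, that is 768 * 5 = 3840 model fits.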

# A previously found parameter set (each value wrapped in a single-element list).
best_params = {
    'bootstrap': [True],
    'max_depth': [90],
    'max_features': ['auto'],
    'min_samples_leaf': [4],
    'min_samples_split': [10],
    'n_estimators': [100]
}
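# best_params is defined but never used below; a hedged sketch of how it could
# skip the grid search (hypothetical, not part of the original commit):
#   model = RandomForestRegressor(**{k: v[0] for k, v in best_params.items()})
#   model.fit(X_train, y_train)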


def get_data_and_store_csv(symbol, interval, start, end=None, limit=1000):
    """
    Fetch klines from the Binance API and store them as a CSV.

    start and end must be ISO-format dates (YYYY-MM-DD), interpreted in UTC.
    Binance returns at most 1000 records per API call, so the data is fetched
    in pages until the end timestamp is reached.
    """

    df = pd.DataFrame()

    if start is None:
        print('start time must not be None')
        exit(1)
    start = calendar.timegm(datetime.fromisoformat(start).timetuple()) * 1000

    if end is None:
        # Default to the current UTC time, in milliseconds since the epoch.
        dt = datetime.now(timezone.utc)
        utc_time = dt.replace(tzinfo=timezone.utc)
        end = int(utc_time.timestamp()) * 1000
    else:
        end = calendar.timegm(datetime.fromisoformat(end).timetuple()) * 1000
    last_time = None

    while len(df) == 0 or (last_time is not None and last_time < end):
        url = 'https://api.binance.com/api/v3/klines?symbol=' + \
            symbol + '&interval=' + interval + '&limit=' + str(limit)
        if len(df) == 0:
            url += '&startTime=' + str(start)
        else:
            url += '&startTime=' + str(last_time)
        url += '&endTime=' + str(end)

        df2 = pd.read_json(url)
        df2.columns = ['Opentime', 'Open', 'High', 'Low', 'Close', 'Volume', 'Closetime',
                       'Quote asset volume', 'Number of trades', 'Taker buy base', 'Taker buy quote', 'Ignore']

        dftmp = pd.DataFrame()
        dftmp = pd.concat([df2, dftmp], axis=0, ignore_index=True, keys=None)
        dftmp.Opentime = pd.to_datetime(dftmp.Opentime, unit='ms')
        dftmp = dftmp.drop(['Quote asset volume', 'Closetime',
                            'Number of trades', 'Taker buy base', 'Taker buy quote', 'Ignore'], axis=1)
        column_names = ["Opentime", "Open", "High", "Low", "Close", "Volume"]

        dftmp.reset_index(drop=True, inplace=True)
        dftmp = dftmp.reindex(columns=column_names)

        # Timestamp (ms since epoch) of the last row fetched, used as the next
        # page's startTime; note consecutive pages overlap by that one candle.
        last_time = (dftmp['Opentime'][len(dftmp) - 1] - datetime(1970, 1, 1)) // timedelta(milliseconds=1)

        df = pd.concat([df, dftmp], axis=0, ignore_index=True, keys=None)

    df.to_csv(f'data/{symbol}--interval-{interval}--start-{start}--end-{end}.csv', index=False)
    return df
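
# Usage sketch, taken from the commented-out call in main() below:
#   df = get_data_and_store_csv('XMRUSDT', '30m', '2022-09-01', '2022-09-04')
# writes data/XMRUSDT--interval-30m--start-...--end-....csv and returns the DataFrame.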


def get_stored_data(symbol, interval, start, end):
    # Convert the ISO dates to the millisecond timestamps used in the CSV filename.
    start = calendar.timegm(datetime.fromisoformat(start).timetuple()) * 1000
    end = calendar.timegm(datetime.fromisoformat(end).timetuple()) * 1000
    df = pd.read_csv(
        f'/crypto_prediction/data/{symbol}--interval-{interval}--start-{start}--end-{end}.csv',
        parse_dates=['Opentime']
    )
    return df


def process_data(df):
    # Derive calendar features from the candle open time; the close price is the target.
    df["sale_year"] = df.Opentime.dt.year
    df["sale_month"] = df.Opentime.dt.month
    df["sale_day"] = df.Opentime.dt.day
    df["sale_day_of_week"] = df.Opentime.dt.dayofweek
    df["sale_day_of_year"] = df.Opentime.dt.dayofyear
    df["price"] = df.Close

    df.drop(["Opentime", "Open", "High", "Low", "Close", "Volume"], axis=1, inplace=True)

    return df


def split_data(df, split_value):
    # Chronological split: the first split_value fraction trains, the rest validates.
    df_split = int(len(df) * split_value)

    df_train = df[:df_split]
    df_val = df[df_split:]

    X_train, y_train = df_train.drop("price", axis=1), df_train["price"]
    X_test, y_test = df_val.drop("price", axis=1), df_val["price"]

    return X_train, X_test, y_train, y_test
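
# Because the rows are in time order and the split is not shuffled, the model is
# validated on the most recent ~18% of candles (split_value=0.82 in main() below),
# which avoids look-ahead leakage into the training set.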


def find_best_hyperparameters_and_train(X_train, y_train):
    global rf_reg_grid

    # Exhaustive search over rf_reg_grid with 5-fold cross-validation,
    # refitting the best estimator on the full training set.
    gs_rf_reg = GridSearchCV(
        RandomForestRegressor(),
        param_grid=rf_reg_grid,
        cv=5,
        refit=True,
        n_jobs=-1
    )

    gs_rf_reg.fit(X_train, y_train)

    print("Best hyperparameters: ", gs_rf_reg.best_params_)

    return gs_rf_reg


def main():
    # Read data into a pandas DataFrame
    # df = get_data_and_store_csv('XMRUSDT', '30m', '2022-09-01', '2022-09-04')
    df = get_stored_data('XMRUSDT', '30m', '2020-01-01', '2022-09-04')
    # Prepare data for modeling
    df = process_data(df=df)
    # Split data into train and test sets
    X_train, X_test, y_train, y_test = split_data(df=df, split_value=0.82)

    # Model the data
    model = find_best_hyperparameters_and_train(
        X_train=X_train,
        y_train=y_train
    )

    # Score our model (R^2 on the held-out test set)
    print('Model scored: ', model.score(X_test, y_test))


if __name__ == "__main__":
    start_time = time.time()
    main()
    end_time = time.time() - start_time
    print(f'\n\nCompleted...\t\t\t\t\t{int(end_time // 3600)} h {int((end_time // 60) % 60)} m {int(end_time % 60)} s')