first commit

main
Gasper Spagnolo 2022-09-05 13:30:09 +02:00
commit 2091278767
4 changed files with 47094 additions and 0 deletions

13
Dockerfile Normal file

@@ -0,0 +1,13 @@
# Base image with conda preinstalled
FROM continuumio/miniconda:latest
WORKDIR /crypto_prediction
COPY . .
# Create the conda environment described in env.yml and auto-activate it
# for interactive shells
RUN conda env create -f env.yml
RUN echo 'source activate crypto_prediction' | tee -a ~/.bashrc
# Put the environment's interpreter first on PATH so "python" resolves to it
ENV PATH /opt/conda/envs/crypto_prediction/bin:$PATH
ENTRYPOINT ["python", "/crypto_prediction/main.py"]
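
A minimal sketch of building and running the image (standard Docker CLI; the image tag is an assumption). main.py writes its CSV cache to /crypto_prediction/data, so mounting a host directory there keeps the data between runs:

docker build -t crypto_prediction .
docker run --rm -v "$PWD/data:/crypto_prediction/data" crypto_prediction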

File diff suppressed because it is too large

58
env.yml Normal file

@@ -0,0 +1,58 @@
name: crypto_prediction
channels:
- defaults
dependencies:
- _libgcc_mutex=0.1=main
- _openmp_mutex=5.1=1_gnu
- beautifulsoup4=4.11.1=py310h06a4308_0
- blas=1.0=mkl
- bottleneck=1.3.5=py310ha9d4c09_0
- bzip2=1.0.8=h7b6447c_0
- ca-certificates=2022.07.19=h06a4308_0
- certifi=2022.6.15=py310h06a4308_0
- fftw=3.3.9=h27cfd23_1
- intel-openmp=2021.4.0=h06a4308_3561
- joblib=1.1.0=pyhd3eb1b0_0
- ld_impl_linux-64=2.38=h1181459_1
- libffi=3.3=he6710b0_2
- libgcc-ng=11.2.0=h1234567_1
- libgfortran-ng=11.2.0=h00389a5_1
- libgfortran5=11.2.0=h1234567_1
- libgomp=11.2.0=h1234567_1
- libstdcxx-ng=11.2.0=h1234567_1
- libuuid=1.0.3=h7f8727e_2
- mkl=2021.4.0=h06a4308_640
- mkl-service=2.4.0=py310h7f8727e_0
- mkl_fft=1.3.1=py310hd6ae3a3_0
- mkl_random=1.2.2=py310h00e6091_0
- ncurses=6.3=h5eee18b_3
- numexpr=2.8.3=py310hcea2de6_0
- numpy=1.21.5=py310h1794996_3
- numpy-base=1.21.5=py310hcba007f_3
- openssl=1.1.1q=h7f8727e_0
- packaging=21.3=pyhd3eb1b0_0
- pandas=1.4.3=py310h6a678d5_0
- pip=22.1.2=py310h06a4308_0
- pyparsing=3.0.9=py310h06a4308_0
- python=3.10.4=h12debd9_0
- python-dateutil=2.8.2=pyhd3eb1b0_0
- pytz=2022.1=py310h06a4308_0
- readline=8.1.2=h7f8727e_1
- scikit-learn=1.1.1=py310h6a678d5_0
- scipy=1.7.3=py310h1794996_2
- setuptools=63.4.1=py310h06a4308_0
- six=1.16.0=pyhd3eb1b0_1
- soupsieve=2.3.1=pyhd3eb1b0_0
- sqlite=3.39.2=h5082296_0
- threadpoolctl=2.2.0=pyh0d69192_0
- tk=8.6.12=h1ccaba5_0
- tzdata=2022a=hda174b7_0
- wheel=0.37.1=pyhd3eb1b0_0
- xz=5.2.5=h7f8727e_1
- zlib=1.2.12=h7f8727e_2
- pip:
  - charset-normalizer==2.1.1
  - idna==3.3
  - requests==2.28.1
  - urllib3==1.26.12
prefix: /home/gasperspagnolo/.conda/envs/tf-venv
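
Outside Docker, the same environment can be recreated from this file; the machine-specific prefix: line above is conda export residue and is ignored when the environment is created from the name: field:

conda env create -f env.yml
conda activate crypto_prediction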

151
main.py Normal file

@@ -0,0 +1,151 @@
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import time
from datetime import datetime, timezone, timedelta
import calendar
# Hyperparameter grid explored by GridSearchCV
rf_reg_grid = {
    'bootstrap': [True, False],
    'max_depth': [5, 10, 70, 90, 100, None],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [1, 2, 4, 6],
    'min_samples_split': [2, 5, 10, 12],
    'n_estimators': [100, 200]
}
# Best combination found by a previous grid search run
best_params = {
    'bootstrap': [True],
    'max_depth': [90],
    'max_features': ['auto'],
    'min_samples_leaf': [4],
    'min_samples_split': [10],
    'n_estimators': [100]
}
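# The full grid enumerates 2 * 6 * 2 * 4 * 4 * 2 = 768 candidates; with cv=5
# that means 3840 model fits per search, which is why the winning combination
# is cached in best_params above.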
def get_data_and_store_csv(symbol, interval, start, end=None, limit=1000):
    """
    start and end must be ISO-format dates (YYYY-MM-DD), interpreted as UTC.
    Binance returns at most 1000 records per API call, so the requested
    range is fetched page by page and concatenated.
    """
    df = pd.DataFrame()
    if start is None:
        raise ValueError('start time must not be None')
    start = calendar.timegm(datetime.fromisoformat(start).timetuple()) * 1000
    if end is None:
        # Default to "now" in UTC, expressed in milliseconds
        end = int(datetime.now(timezone.utc).timestamp()) * 1000
    else:
        end = calendar.timegm(datetime.fromisoformat(end).timetuple()) * 1000
    last_time = None
    while len(df) == 0 or (last_time is not None and last_time < end):
        url = 'https://api.binance.com/api/v3/klines?symbol=' + \
            symbol + '&interval=' + interval + '&limit=' + str(limit)
        if len(df) == 0:
            url += '&startTime=' + str(start)
        else:
            url += '&startTime=' + str(last_time)
        url += '&endTime=' + str(end)
        df2 = pd.read_json(url)
        df2.columns = ['Opentime', 'Open', 'High', 'Low', 'Close', 'Volume', 'Closetime',
                       'Quote asset volume', 'Number of trades', 'Taker buy base', 'Taker buy quote', 'Ignore']
        dftmp = df2.copy()
        dftmp.Opentime = pd.to_datetime(dftmp.Opentime, unit='ms')
        dftmp = dftmp.drop(['Quote asset volume', 'Closetime', 'Number of trades',
                            'Taker buy base', 'Taker buy quote', 'Ignore'], axis=1)
        dftmp.reset_index(drop=True, inplace=True)
        # Millisecond timestamp of the newest candle fetched so far, used to
        # page the next request
        last_time = (dftmp['Opentime'][len(dftmp) - 1] - datetime(1970, 1, 1)) // timedelta(milliseconds=1)
        df = pd.concat([df, dftmp], axis=0, ignore_index=True)
    df.to_csv(f'data/{symbol}--interval-{interval}--start-{start}--end-{end}.csv', index=False)
    return df
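# A minimal usage sketch (assumes network access to api.binance.com and an
# existing data/ directory next to main.py):
#   get_data_and_store_csv('XMRUSDT', '30m', '2022-09-01', end='2022-09-04')
# Binance timestamps are UTC milliseconds, e.g. for 2022-09-01:
#   calendar.timegm(datetime.fromisoformat('2022-09-01').timetuple()) * 1000
#   -> 1661990400000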
def get_stored_data(symbol, interval, start, end):
    start = calendar.timegm(datetime.fromisoformat(start).timetuple()) * 1000
    end = calendar.timegm(datetime.fromisoformat(end).timetuple()) * 1000
    df = pd.read_csv(
        f'/crypto_prediction/data/{symbol}--interval-{interval}--start-{start}--end-{end}.csv',
        parse_dates=['Opentime']
    )
    return df
def process_data(df):
    # Expand the candle open time into calendar features and keep the close
    # price as the regression target
    df["sale_year"] = df.Opentime.dt.year
    df["sale_month"] = df.Opentime.dt.month
    df["sale_day"] = df.Opentime.dt.day
    df["sale_day_of_week"] = df.Opentime.dt.dayofweek
    df["sale_day_of_year"] = df.Opentime.dt.dayofyear
    df["price"] = df.Close
    df.drop(["Opentime", "Open", "High", "Low", "Close", "Volume"], axis=1, inplace=True)
    return df
def split_data(df, split_value):
    df_split = int(len(df) * split_value)
    df_train = df[:df_split]
    df_val = df[df_split:]
    X_train, y_train = df_train.drop("price", axis=1), df_train["price"]
    X_test, y_test = df_val.drop("price", axis=1), df_val["price"]
    return X_train, X_test, y_train, y_test
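# Note: split_data performs a chronological split (no shuffling), so the
# model is validated on the most recent (1 - split_value) share of candles
# rather than on a random sample.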
def find_best_hyperparameters_and_train(X_train, y_train):
    # Exhaustive 5-fold cross-validated search over rf_reg_grid; refit=True
    # retrains the best candidate on the full training set
    gs_rf_reg = GridSearchCV(
        RandomForestRegressor(),
        param_grid=rf_reg_grid,
        cv=5,
        refit=True,
        n_jobs=-1
    )
    gs_rf_reg.fit(X_train, y_train)
    print("Best hyperparameters: ", gs_rf_reg.best_params_)
    return gs_rf_reg
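# A minimal sketch for reusing the cached best_params instead of re-running
# the search (hypothetical shortcut, not wired into main() below):
#   model = RandomForestRegressor(**{k: v[0] for k, v in best_params.items()})
#   model.fit(X_train, y_train)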
def main():
    # Read data into a pandas DataFrame
    #df = get_data_and_store_csv('XMRUSDT', '30m', '2022-09-01', '2022-09-04')
    df = get_stored_data('XMRUSDT', '30m', '2020-01-01', '2022-09-04')
    # Prepare data for modeling
    df = process_data(df=df)
    # Split data into train and test sets
    X_train, X_test, y_train, y_test = split_data(df=df, split_value=0.82)
    # Model the data
    model = find_best_hyperparameters_and_train(
        X_train=X_train,
        y_train=y_train
    )
    # Score our model
    print('Model scored: ', model.score(X_test, y_test))
if __name__ == "__main__":
    start_time = time.time()
    main()
    end_time = time.time() - start_time
    print(f'\n\nCompleted...\t\t\t\t\t{int(end_time // 3600)} h {int((end_time // 60) % 60)} m {int(end_time % 60)} s')