# plotly standard imports
import plotly.graph_objs as go
import chart_studio.plotly as py
# Cufflinks wrapper on plotly
import cufflinks
# Data science imports
import pandas as pd
import numpy as np
# Options for pandas
pd.options.display.max_columns = 30
# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
from plotly.offline import iplot, init_notebook_mode
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)
# Set global theme
cufflinks.set_config_file(world_readable=True, theme='pearl')
Now load the datasets and select the interesting features:
from src.load_datasets import load_datasets
from src.prepare_datasets import feature_list
train, test = load_datasets()
train_features = train[feature_list]
test_features = test[feature_list]
train_features.index = pd.to_datetime(train.pop('timestamp'), unit='ms')
test_features.index = pd.to_datetime(test.pop('timestamp'), unit='ms')
train_features
| timestamp | high | low | open | close | volume |
|---|---|---|---|---|---|
| 2016-08-10 15:53:00 | 579.000000 | 579.00 | 579.00 | 579.000000 | 1.000000 |
| 2016-08-10 15:54:00 | 604.750000 | 592.96 | 592.96 | 604.750000 | 131.838200 |
| 2016-08-10 15:55:00 | 604.750000 | 600.00 | 604.75 | 600.000000 | 89.437926 |
| 2016-08-10 15:57:00 | 604.750000 | 600.00 | 600.00 | 604.750000 | 51.328200 |
| 2016-08-10 15:58:00 | 604.750000 | 604.75 | 604.75 | 604.750000 | 4.586500 |
| ... | ... | ... | ... | ... | ... |
| 2020-04-22 12:37:00 | 6946.000000 | 6945.00 | 6945.90 | 6946.000000 | 0.130838 |
| 2020-04-22 12:38:00 | 6948.700000 | 6946.00 | 6946.00 | 6948.684645 | 0.951414 |
| 2020-04-22 12:39:00 | 6951.300000 | 6947.10 | 6948.70 | 6950.100000 | 0.055320 |
| 2020-04-22 12:40:00 | 6952.628959 | 6951.30 | 6951.30 | 6952.628959 | 0.168686 |
| 2020-04-22 12:41:00 | 6954.692469 | 6953.90 | 6953.90 | 6954.600000 | 0.046001 |

1816296 rows × 5 columns
In theory we are going to use four features: the price itself plus three technical indicators:

* MACD (trend)
* Stochastic oscillator (momentum)
* Average True Range (volatility)

Exponential Moving Average: a type of infinite impulse response filter whose weighting factors decrease exponentially; the weight of each older datum shrinks but never reaches zero.

MACD: the Moving Average Convergence/Divergence oscillator is one of the simplest and most effective momentum indicators available. It turns two trend-following indicators, moving averages, into a momentum oscillator by subtracting the longer moving average from the shorter one.

Stochastic oscillator: a momentum indicator that shows the location of the close relative to the high-low range over a set number of periods.

Average True Range: an indicator of volatility (NOT price direction). It averages the true range, which is the largest of: the current high minus the current low, the absolute difference between the current high and the previous close, and the absolute difference between the current low and the previous close.
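For reference, these indicators can be sketched directly with pandas. This is a minimal illustration using the conventional default windows (12/26/9 for MACD, 14 for the stochastic oscillator and ATR); the ta package used below applies its own smoothing details, so treat it as an approximation rather than the exact formulas.
import pandas as pd
def add_basic_indicators(df):
    """Rough pandas versions of EMA, MACD, stochastic oscillator and ATR."""
    out = df.copy()
    # Exponential moving averages: weights decay exponentially, never reaching zero
    ema12 = out['close'].ewm(span=12, adjust=False).mean()
    ema26 = out['close'].ewm(span=26, adjust=False).mean()
    # MACD: shorter EMA minus longer EMA, plus its 9-period signal line
    out['macd'] = ema12 - ema26
    out['macd_signal'] = out['macd'].ewm(span=9, adjust=False).mean()
    # Stochastic oscillator: position of the close inside the 14-period high-low range
    low14 = out['low'].rolling(14).min()
    high14 = out['high'].rolling(14).max()
    out['stoch_k'] = 100 * (out['close'] - low14) / (high14 - low14)
    out['stoch_d'] = out['stoch_k'].rolling(3).mean()
    # Average True Range: rolling mean of the true range (the largest of the three spreads)
    prev_close = out['close'].shift(1)
    true_range = pd.concat([
        out['high'] - out['low'],
        (out['high'] - prev_close).abs(),
        (out['low'] - prev_close).abs(),
    ], axis=1).max(axis=1)
    out['atr'] = true_range.rolling(14).mean()
    return out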
from ta import add_all_ta_features
from ta.utils import dropna
days_to_show = 60
items_to_show = days_to_show * 24 * 60
# dropna from ta also removes zero values and extreme (near max-double) values
df_show = dropna(train_features[-items_to_show:])
df_show = add_all_ta_features(
df_show,
open="open",
high="high",
low="low",
close="close",
volume="volume",
fillna=True
)
df_show = df_show[[
'open', 'high', 'low', 'close', 'volume',
'volatility_bbm', 'volatility_bbh', 'volatility_bbl',
'trend_macd', 'momentum_rsi', 'volatility_kchi',
'trend_ichimoku_conv', 'trend_ichimoku_a', 'trend_ichimoku_b',
'momentum_stoch', 'momentum_stoch_signal', 'volatility_atr'
]]
df_show[['trend_macd', 'close']].iplot(subplots=True)
df_show[['momentum_stoch','momentum_stoch_signal', 'close']].iplot(subplots=True)
df_show[['volatility_atr', 'close']].iplot(subplots=True)
import scipy.stats as stats
import pylab
close_change = train_features['close'].pct_change()[1:]
close_change.head()
stats.probplot(close_change, dist='norm', plot=pylab)
timestamp
2016-08-10 15:54:00    0.044473
2016-08-10 15:55:00   -0.007854
2016-08-10 15:57:00    0.007917
2016-08-10 15:58:00    0.000000
2016-08-10 15:59:00    0.000000
Name: close, dtype: float64
((array([-4.94453614, -4.76886449, -4.67400546, ..., 4.67400546, 4.76886449, 4.94453614]), array([-0.17508113, -0.12714831, -0.0479099 , ..., 0.06530612, 0.06600338, 0.12148934])), (0.00107797240830823, 2.207475798096329e-06, 0.8328053817597639))
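The last element of the probplot result, (slope, intercept, r), reports r ≈ 0.83 here, so the minute-level returns are clearly not normal: the tails are much heavier than a Gaussian would allow. A quick supplementary check (not in the original notebook) is to look at skewness, excess kurtosis and a couple of tail quantiles:
returns = close_change.dropna().to_numpy()
# A normal distribution has skewness 0 and excess kurtosis 0
print("skew:", stats.skew(returns))
print("excess kurtosis:", stats.kurtosis(returns))
# Compare extreme empirical quantiles with those of a fitted normal distribution
mu, sigma = stats.norm.fit(returns)
for q in (0.001, 0.999):
    print(f"q={q}: empirical={np.quantile(returns, q):.5f}, normal fit={stats.norm.ppf(q, mu, sigma):.5f}")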
import tensorflow as tf
import matplotlib.pyplot as plt
def plot_log_frequency(series):
    # real-valued FFT of the series
    fft = tf.signal.rfft(series)
    f_per_dataset = np.arange(0, len(fft))
    # convert frequency bins into cycles per year (assuming one sample per day)
    n_samples_d = len(series)
    days_per_year = 365
    years_per_dataset = n_samples_d / days_per_year
    f_per_year = f_per_dataset / years_per_dataset
    plt.step(f_per_year, np.abs(fft))
    plt.xscale('log')
    plt.xticks([1, 365], labels=['1/Year', '1/day'])
    _ = plt.xlabel('Frequency (log scale)')
plot_log_frequency(train_features['close'])
plot_log_frequency(train_features['close'].diff().dropna())
plot_log_frequency(train_features['volume'])
plot_log_frequency(train_features['volume'].diff().dropna())
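Note that the tick labels in plot_log_frequency assume one sample per day, while the series here is sampled every minute. If you want the '1/Year' and '1/day' ticks to line up with the spectrum, one option (an addition on my part, not something the original does) is to resample to daily values first:
# Aggregate the minute data to one value per day so the frequency axis
# of plot_log_frequency is really in cycles per year / per day.
daily_close = train_features['close'].resample('1D').mean().dropna()
plot_log_frequency(daily_close)
plot_log_frequency(daily_close.diff().dropna())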
import sweetviz as sv
compare_report = sv.compare([train_features, 'Train data'], [test_features, 'Test data'], "close")
compare_report.show_notebook()
# plot every 60th minute (~hourly samples) to keep the charts responsive
train_features[59::60].iplot(subplots=True)
test_features[59::60].iplot(subplots=True)
We will use only training-set statistics for scaling, so the network gets no information from the test dataset.
Each feature is divided by its max-min range.
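Concretely this is plain min-max scaling with statistics taken from the training set only; the MinMaxScaler used below computes the equivalent of this small sketch:
def min_max_scale(x, train_min, train_max):
    # x can be a Series or DataFrame; the min/max come from the training set only
    return (x - train_min) / (train_max - train_min)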
pd.set_option('float_format', '{:.2f}'.format)
train_features.describe()
| | high | low | open | close | volume |
|---|---|---|---|---|---|
| count | 1816296.00 | 1816296.00 | 1816296.00 | 1816296.00 | 1816296.00 |
| mean | 6145.04 | 6138.47 | 6141.76 | 6141.75 | 17.74 |
| std | 3544.95 | 3538.94 | 3541.95 | 3541.94 | 52.61 |
| min | 563.24 | 562.99 | 563.10 | 563.00 | 0.00 |
| 25% | 3588.77 | 3586.88 | 3587.70 | 3587.80 | 0.69 |
| 50% | 6497.00 | 6494.50 | 6495.63 | 6495.60 | 3.94 |
| 75% | 8481.90 | 8475.00 | 8479.08 | 8479.00 | 15.13 |
| max | 19891.00 | 19880.00 | 19890.00 | 19891.00 | 6717.52 |
test_features.describe()
| | high | low | open | close | volume |
|---|---|---|---|---|---|
| count | 454074.00 | 454074.00 | 454074.00 | 454074.00 | 454074.00 |
| mean | 18674.91 | 18652.66 | 18663.90 | 18663.96 | 5.55 |
| std | 12974.03 | 12947.02 | 12960.61 | 12960.68 | 23.19 |
| min | 6955.20 | 6952.80 | 6954.70 | 6955.04 | 0.00 |
| 25% | 9649.30 | 9645.11 | 9647.40 | 9647.35 | 0.18 |
| 50% | 11656.00 | 11651.75 | 11654.00 | 11654.00 | 0.97 |
| 75% | 23178.00 | 23150.00 | 23165.00 | 23163.99 | 3.73 |
| max | 58321.24 | 58304.00 | 58317.00 | 58317.00 | 2982.69 |
The training maximum is too small and would not allow the model to predict the higher values in the test dataset correctly, so we will use a manually chosen maximum of 100,000 dollars for the price columns (volume keeps its observed maximum).
from sklearn.preprocessing import MinMaxScaler
import numpy as np
train_min = np.min(train_features)
train_max = np.max(train_features)
MAX_TARGET = 100000
train_max['high'] = MAX_TARGET
train_max['low'] = MAX_TARGET
train_max['open'] = MAX_TARGET
train_max['close'] = MAX_TARGET
train_fit = pd.DataFrame([train_min, train_max])
scaler = MinMaxScaler()
scaler = scaler.fit(train_fit)
scaler.scale_
scaler.min_
array([1.00566430e-05, 1.00566178e-05, 1.00566289e-05, 1.00566188e-05, 1.48864535e-04])
array([-5.66430362e-03, -5.66177523e-03, -5.66288772e-03, -5.66187636e-03, -1.48864535e-12])
print("normalise train dataset...")
train_normalised = pd.DataFrame(scaler.transform(train_features))
train_normalised.columns = train_features.columns
train_normalised.index = train_features.index
print("normalise test dataset...")
test_normalised = pd.DataFrame(scaler.transform(test_features))
test_normalised.columns = test_features.columns
test_normalised.index = test_features.index
train_normalised.head()
normalise train dataset...
normalise test dataset...
| timestamp | high | low | open | close | volume |
|---|---|---|---|---|---|
| 2016-08-10 15:53:00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 2016-08-10 15:54:00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.02 |
| 2016-08-10 15:55:00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.01 |
| 2016-08-10 15:57:00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.01 |
| 2016-08-10 15:58:00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
train_normalised[59::60].iplot(subplots=True, title="Train")
test_normalised[59::60].iplot(subplots=True, title="Test")
train_in_hours = train_features[59::60]
# compare the raw close price with its normalised counterpart, hour by hour
feature2normalised = pd.DataFrame({
    'Real': train_in_hours['close'],
    'Normalised': train_normalised['close'][59::60]
})
feature2normalised.index = train_in_hours.index
feature2normalised.iplot(subplots=True)
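When the model later produces predictions in this normalised space, the same scaler can map them back to prices. A usage sketch (the round trip on the training data is just a sanity check, not part of the original flow):
# Invert the scaling; the frame must have the same five columns, in the same order,
# that the scaler was fitted on.
restored_train = pd.DataFrame(
    scaler.inverse_transform(train_normalised),
    columns=train_normalised.columns,
    index=train_normalised.index,
)
restored_train.head()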