# plotly standard imports
import plotly.graph_objs as go
import chart_studio.plotly as py
# Cufflinks wrapper on plotly
import cufflinks
# Data science imports
import pandas as pd
import numpy as np
# Options for pandas
pd.options.display.max_columns = 30
# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
from plotly.offline import iplot, init_notebook_mode
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)
# Set global theme
cufflinks.set_config_file(world_readable=True, theme='pearl')
Now load the datasets and select the interesting features:
from src.load_datasets import load_datasets
from src.prepare_datasets import feature_list
train, test = load_datasets()
train_features = train[feature_list]
test_features = test[feature_list]
train_features.index = pd.to_datetime(train.pop('timestamp'), unit='ms')
test_features.index = pd.to_datetime(test.pop('timestamp'), unit='ms')
train_features
| timestamp | high | low | open | close | volume |
|---|---|---|---|---|---|
| 2016-08-10 15:53:00 | 579.000000 | 579.00 | 579.00 | 579.000000 | 1.000000 |
| 2016-08-10 15:54:00 | 604.750000 | 592.96 | 592.96 | 604.750000 | 131.838200 |
| 2016-08-10 15:55:00 | 604.750000 | 600.00 | 604.75 | 600.000000 | 89.437926 |
| 2016-08-10 15:57:00 | 604.750000 | 600.00 | 600.00 | 604.750000 | 51.328200 |
| 2016-08-10 15:58:00 | 604.750000 | 604.75 | 604.75 | 604.750000 | 4.586500 |
| ... | ... | ... | ... | ... | ... |
| 2020-04-22 12:37:00 | 6946.000000 | 6945.00 | 6945.90 | 6946.000000 | 0.130838 |
| 2020-04-22 12:38:00 | 6948.700000 | 6946.00 | 6946.00 | 6948.684645 | 0.951414 |
| 2020-04-22 12:39:00 | 6951.300000 | 6947.10 | 6948.70 | 6950.100000 | 0.055320 |
| 2020-04-22 12:40:00 | 6952.628959 | 6951.30 | 6951.30 | 6952.628959 | 0.168686 |
| 2020-04-22 12:41:00 | 6954.692469 | 6953.90 | 6953.90 | 6954.600000 | 0.046001 |

1816296 rows × 5 columns
In theory we are going to use four features: the price itself plus three technical indicators:

* MACD (trend)
* Stochastic oscillator (momentum)
* Average True Range (volatility)

Exponential Moving Average: a type of infinite impulse response filter whose weighting factors decrease exponentially; the weight of each older datum shrinks but never reaches zero.

MACD: the Moving Average Convergence/Divergence oscillator is one of the simplest and most effective momentum indicators available. It turns two trend-following indicators, moving averages, into a momentum oscillator by subtracting the longer moving average from the shorter one.

Stochastic oscillator: a momentum indicator that shows the location of the close relative to the high-low range over a set number of periods.

Average True Range: an indicator of volatility (NOT price direction). It averages the true range, which is the largest of: the current high minus the current low, the absolute difference between the current high and the previous close, and the absolute difference between the current low and the previous close.
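For reference, these indicators can be sketched directly with pandas. This is a minimal illustration using the conventional default windows (12/26/9 for MACD, 14 for the stochastic oscillator and ATR); the ta package used below applies its own smoothing details, so treat it as an approximation rather than the exact formulas.
import pandas as pd
def add_basic_indicators(df):
    """Rough pandas versions of EMA, MACD, stochastic oscillator and ATR."""
    out = df.copy()
    # Exponential moving averages: weights decay exponentially, never reaching zero
    ema12 = out['close'].ewm(span=12, adjust=False).mean()
    ema26 = out['close'].ewm(span=26, adjust=False).mean()
    # MACD: shorter EMA minus longer EMA, plus its 9-period signal line
    out['macd'] = ema12 - ema26
    out['macd_signal'] = out['macd'].ewm(span=9, adjust=False).mean()
    # Stochastic oscillator: position of the close inside the 14-period high-low range
    low14 = out['low'].rolling(14).min()
    high14 = out['high'].rolling(14).max()
    out['stoch_k'] = 100 * (out['close'] - low14) / (high14 - low14)
    out['stoch_d'] = out['stoch_k'].rolling(3).mean()
    # Average True Range: rolling mean of the true range (the largest of the three spreads)
    prev_close = out['close'].shift(1)
    true_range = pd.concat([
        out['high'] - out['low'],
        (out['high'] - prev_close).abs(),
        (out['low'] - prev_close).abs(),
    ], axis=1).max(axis=1)
    out['atr'] = true_range.rolling(14).mean()
    return out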
from ta import add_all_ta_features
from ta.utils import dropna
days_to_show = 60
items_to_show = days_to_show * 24 * 60
# dropna from ta also removes zero values and extreme (near max-double) values
df_show = dropna(train_features[-items_to_show:])
df_show = add_all_ta_features(
df_show,
open="open",
high="high",
low="low",
close="close",
volume="volume",
fillna=True
)
df_show = df_show[[
'open', 'high', 'low', 'close', 'volume',
'volatility_bbm', 'volatility_bbh', 'volatility_bbl',
'trend_macd', 'momentum_rsi', 'volatility_kchi',
'trend_ichimoku_conv', 'trend_ichimoku_a', 'trend_ichimoku_b',
'momentum_stoch', 'momentum_stoch_signal', 'volatility_atr'
]]
df_show[['trend_macd', 'close']].iplot(subplots=True)
df_show[['momentum_stoch','momentum_stoch_signal', 'close']].iplot(subplots=True)
df_show[['volatility_atr', 'close']].iplot(subplots=True)
import scipy.stats as stats
import pylab
close_change = train_features['close'].pct_change()[1:]
close_change.head()
stats.probplot(close_change, dist='norm', plot=pylab)
timestamp
2016-08-10 15:54:00    0.044473
2016-08-10 15:55:00   -0.007854
2016-08-10 15:57:00    0.007917
2016-08-10 15:58:00    0.000000
2016-08-10 15:59:00    0.000000
Name: close, dtype: float64
((array([-4.94453614, -4.76886449, -4.67400546, ..., 4.67400546, 4.76886449, 4.94453614]), array([-0.17508113, -0.12714831, -0.0479099 , ..., 0.06530612, 0.06600338, 0.12148934])), (0.00107797240830823, 2.207475798096329e-06, 0.8328053817597639))
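The last element of the probplot result, (slope, intercept, r), reports r ≈ 0.83 here, so the minute-level returns are clearly not normal: the tails are much heavier than a Gaussian would allow. A quick supplementary check (not in the original notebook) is to look at skewness, excess kurtosis and a couple of tail quantiles:
returns = close_change.dropna().to_numpy()
# A normal distribution has skewness 0 and excess kurtosis 0
print("skew:", stats.skew(returns))
print("excess kurtosis:", stats.kurtosis(returns))
# Compare extreme empirical quantiles with those of a fitted normal distribution
mu, sigma = stats.norm.fit(returns)
for q in (0.001, 0.999):
    print(f"q={q}: empirical={np.quantile(returns, q):.5f}, normal fit={stats.norm.ppf(q, mu, sigma):.5f}")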
import tensorflow as tf
import matplotlib.pyplot as plt
def plot_log_frequency(series):
    # real-valued FFT of the series
    fft = tf.signal.rfft(series)
    f_per_dataset = np.arange(0, len(fft))
    # convert frequency bins into cycles per year (assuming one sample per day)
    n_samples_d = len(series)
    days_per_year = 365
    years_per_dataset = n_samples_d / days_per_year
    f_per_year = f_per_dataset / years_per_dataset
    plt.step(f_per_year, np.abs(fft))
    plt.xscale('log')
    plt.xticks([1, 365], labels=['1/Year', '1/day'])
    _ = plt.xlabel('Frequency (log scale)')
plot_log_frequency(train_features['close'])
plot_log_frequency(train_features['close'].diff().dropna())
plot_log_frequency(train_features['volume'])
plot_log_frequency(train_features['volume'].diff().dropna())
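Note that the tick labels in plot_log_frequency assume one sample per day, while the series here is sampled every minute. If you want the '1/Year' and '1/day' ticks to line up with the spectrum, one option (an addition on my part, not something the original does) is to resample to daily values first:
# Aggregate the minute data to one value per day so the frequency axis
# of plot_log_frequency is really in cycles per year / per day.
daily_close = train_features['close'].resample('1D').mean().dropna()
plot_log_frequency(daily_close)
plot_log_frequency(daily_close.diff().dropna())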
import sweetviz as sv
compare_report = sv.compare([train_features, 'Train data'], [test_features, 'Test data'], "close")
compare_report.show_notebook()
# plot every 60th minute (~hourly samples) to keep the charts responsive
train_features[59::60].iplot(subplots=True)
test_features[59::60].iplot(subplots=True)
We will use only training-set statistics for scaling, so the network gets no information from the test dataset.
Each feature is divided by its max-min range.
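Concretely this is plain min-max scaling with statistics taken from the training set only; the MinMaxScaler used below computes the equivalent of this small sketch:
def min_max_scale(x, train_min, train_max):
    # x can be a Series or DataFrame; the min/max come from the training set only
    return (x - train_min) / (train_max - train_min)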
pd.set_option('float_format', '{:.2f}'.format)
train_features.describe()
| | high | low | open | close | volume |
|---|---|---|---|---|---|
| count | 1816296.00 | 1816296.00 | 1816296.00 | 1816296.00 | 1816296.00 |
| mean | 6145.04 | 6138.47 | 6141.76 | 6141.75 | 17.74 |
| std | 3544.95 | 3538.94 | 3541.95 | 3541.94 | 52.61 |
| min | 563.24 | 562.99 | 563.10 | 563.00 | 0.00 |
| 25% | 3588.77 | 3586.88 | 3587.70 | 3587.80 | 0.69 |
| 50% | 6497.00 | 6494.50 | 6495.63 | 6495.60 | 3.94 |
| 75% | 8481.90 | 8475.00 | 8479.08 | 8479.00 | 15.13 |
| max | 19891.00 | 19880.00 | 19890.00 | 19891.00 | 6717.52 |
test_features.describe()
| | high | low | open | close | volume |
|---|---|---|---|---|---|
| count | 454074.00 | 454074.00 | 454074.00 | 454074.00 | 454074.00 |
| mean | 18674.91 | 18652.66 | 18663.90 | 18663.96 | 5.55 |
| std | 12974.03 | 12947.02 | 12960.61 | 12960.68 | 23.19 |
| min | 6955.20 | 6952.80 | 6954.70 | 6955.04 | 0.00 |
| 25% | 9649.30 | 9645.11 | 9647.40 | 9647.35 | 0.18 |
| 50% | 11656.00 | 11651.75 | 11654.00 | 11654.00 | 0.97 |
| 75% | 23178.00 | 23150.00 | 23165.00 | 23163.99 | 3.73 |
| max | 58321.24 | 58304.00 | 58317.00 | 58317.00 | 2982.69 |
The training maximum is too small and would not allow the model to predict the higher values in the test dataset correctly, so we will use a manually chosen maximum of 100,000 dollars for the price columns (volume keeps its observed maximum).
from sklearn.preprocessing import MinMaxScaler
import numpy as np
train_min = np.min(train_features)
train_max = np.max(train_features)
MAX_TARGET = 100000
train_max['high'] = MAX_TARGET
train_max['low'] = MAX_TARGET
train_max['open'] = MAX_TARGET
train_max['close'] = MAX_TARGET
train_fit = pd.DataFrame([train_min, train_max])
scaler = MinMaxScaler()
scaler = scaler.fit(train_fit)
scaler.scale_
scaler.min_
array([1.00566430e-05, 1.00566178e-05, 1.00566289e-05, 1.00566188e-05, 1.48864535e-04])
array([-5.66430362e-03, -5.66177523e-03, -5.66288772e-03, -5.66187636e-03, -1.48864535e-12])
print("normalise train dataset...")
train_normalised = pd.DataFrame(scaler.transform(train_features))
train_normalised.columns = train_features.columns
train_normalised.index = train_features.index
print("normalise test dataset...")
test_normalised = pd.DataFrame(scaler.transform(test_features))
test_normalised.columns = test_features.columns
test_normalised.index = test_features.index
train_normalised.head()
normalise train dataset...
normalise test dataset...
| timestamp | high | low | open | close | volume |
|---|---|---|---|---|---|
| 2016-08-10 15:53:00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 2016-08-10 15:54:00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.02 |
| 2016-08-10 15:55:00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.01 |
| 2016-08-10 15:57:00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.01 |
| 2016-08-10 15:58:00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
train_normalised[59::60].iplot(subplots=True, title="Train")
test_normalised[59::60].iplot(subplots=True, title="Test")
train_in_hours = train_features[59::60]
# compare the raw close price with its normalised counterpart, hour by hour
feature2normalised = pd.DataFrame({
    'Real': train_in_hours['close'],
    'Normalised': train_normalised['close'][59::60]
})
feature2normalised.index = train_in_hours.index
feature2normalised.iplot(subplots=True)
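When the model later produces predictions in this normalised space, the same scaler can map them back to prices. A usage sketch (the round trip on the training data is just a sanity check, not part of the original flow):
# Invert the scaling; the frame must have the same five columns, in the same order,
# that the scaler was fitted on.
restored_train = pd.DataFrame(
    scaler.inverse_transform(train_normalised),
    columns=train_normalised.columns,
    index=train_normalised.index,
)
restored_train.head()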