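"""Alpha factor selection on Quantopian research.

Defines a library of candidate alpha factors (momentum, value, quality,
sentiment, and technical signals), ranks them over the Q1500US universe
with a Pipeline, and trains an AdaBoost classifier to predict whether a
stock's 5-day forward return lands in the top or bottom bucket of the
cross-section. Feature importances of the fitted model are plotted at
the end. Written for the (Python 2) Quantopian research environment.
"""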
from time import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import talib
import alphalens as al
import pyfolio as pf
from scipy import stats
from sklearn import linear_model, decomposition, ensemble, preprocessing, isotonic, metrics
from sklearn.ensemble.partial_dependence import plot_partial_dependence
from sklearn.ensemble.partial_dependence import partial_dependence

from quantopian.research import run_pipeline
from quantopian.pipeline import Pipeline
from quantopian.pipeline.factors import (CustomFactor, Latest, SimpleMovingAverage,
                                         AverageDollarVolume, Returns, RSI)
from quantopian.pipeline.data.builtin import USEquityPricing
from quantopian.pipeline.data import morningstar
from quantopian.pipeline.classifiers.morningstar import Sector
from quantopian.pipeline.filters import Q500US, Q1500US
from quantopian.pipeline.data.quandl import fred_usdontd156n as libor
from quantopian.pipeline.data.zacks import EarningsSurprises
from quantopian.pipeline.data.sentdex import sentiment
# Short aliases for the Morningstar fundamentals datasets
bs = morningstar.balance_sheet
cfs = morningstar.cash_flow_statement
is_ = morningstar.income_statement
or_ = morningstar.operation_ratios
er = morningstar.earnings_report
v = morningstar.valuation
vr = morningstar.valuation_ratios
def make_factors():
    """Return a dict mapping factor names to factor constructors."""

    class Momentum(CustomFactor):
        # 12-month price momentum, excluding the most recent month
        inputs = [USEquityPricing.close]
        window_length = 252

        def compute(self, today, assets, out, prices):
            out[:] = ((prices[-21] - prices[-252]) / prices[-252] -
                      (prices[-1] - prices[-21]) / prices[-21])

    def Growth():
        return or_.revenue_growth.latest

    def PE_ratio():
        return vr.pe_ratio.latest

    def Sentiment():
        return sentiment.sentiment_signal.latest

    def Asset_Growth_3M():
        return Returns(inputs=[bs.total_assets], window_length=63)

    def Asset_To_Equity_Ratio():
        return bs.total_assets.latest / bs.common_stock_equity.latest

    def Capex_To_Cashflows():
        return (cfs.capital_expenditure.latest * 4.) / \
            (cfs.free_cash_flow.latest * 4.)

    def EBITDA_Yield():
        return (is_.ebitda.latest * 4.) / \
            USEquityPricing.close.latest

    def EBIT_To_Assets():
        return (is_.ebit.latest * 4.) / \
            bs.total_assets.latest

    def Earnings_Quality():
        return morningstar.cash_flow_statement.operating_cash_flow.latest / \
            EarningsSurprises.eps_act.latest

    def Return_On_Total_Invest_Capital():
        return or_.roic.latest

    class Mean_Reversion_1M(CustomFactor):
        # z-score of the latest 1-month return against its 12-month history
        inputs = [Returns(window_length=21)]
        window_length = 252

        def compute(self, today, assets, out, monthly_rets):
            out[:] = (monthly_rets[-1] - np.nanmean(monthly_rets, axis=0)) / \
                np.nanstd(monthly_rets, axis=0)

    class MACD_Signal_10d(CustomFactor):
        inputs = [USEquityPricing.close]
        window_length = 60

        def compute(self, today, assets, out, close):
            sig_lines = []
            for col in close.T:
                # get the signal line only
                try:
                    _, signal_line, _ = talib.MACD(col, fastperiod=12,
                                                   slowperiod=26, signalperiod=10)
                    sig_lines.append(signal_line[-1])
                # if the calculation fails (e.g. too many NaNs), return NaN
                except Exception:
                    sig_lines.append(np.nan)
            out[:] = sig_lines

    class Moneyflow_Volume_5d(CustomFactor):
        inputs = [USEquityPricing.close, USEquityPricing.volume]
        window_length = 5

        def compute(self, today, assets, out, close, volume):
            mfvs = []
            for col_c, col_v in zip(close.T, volume.T):
                # denominator: total dollar volume over the window
                denominator = np.dot(col_c, col_v)
                # numerator: dollar volume signed by the direction of the price move
                # (note: at n == 0 this compares against the last price in the window)
                numerator = 0.
                for n, price in enumerate(col_c.tolist()):
                    if price > col_c[n - 1]:
                        numerator += price * col_v[n]
                    else:
                        numerator -= price * col_v[n]
                mfvs.append(numerator / denominator)
            out[:] = mfvs

    def Net_Income_Margin():
        return or_.net_margin.latest

    def Operating_Cashflows_To_Assets():
        return (cfs.operating_cash_flow.latest * 4.) / \
            bs.total_assets.latest

    def Price_Momentum_3M():
        return Returns(window_length=63)

    class Price_Oscillator(CustomFactor):
        inputs = [USEquityPricing.close]
        window_length = 252

        def compute(self, today, assets, out, close):
            four_week_period = close[-20:]
            out[:] = (np.nanmean(four_week_period, axis=0) /
                      np.nanmean(close, axis=0)) - 1.

    def Returns_39W():
        return Returns(window_length=215)

    class Trendline(CustomFactor):
        inputs = [USEquityPricing.close]
        window_length = 252

        # using MLE for speed
        def compute(self, today, assets, out, close):
            # prepare X matrix (x_is - x_bar)
            X = range(self.window_length)
            X_bar = np.nanmean(X)
            X_vector = X - X_bar
            X_matrix = np.tile(X_vector, (len(close.T), 1)).T

            # prepare Y matrix (y_is - y_bar)
            Y_bar = np.nanmean(close, axis=0)
            Y_bars = np.tile(Y_bar, (self.window_length, 1))
            Y_matrix = close - Y_bars

            # prepare variance of X
            X_var = np.nanvar(X)

            # multiply X matrix and Y matrix and sum (dot product),
            # then divide by the variance of X;
            # this gives the MLE of the regression slope (beta)
            out[:] = (np.sum((X_matrix * Y_matrix), axis=0) / X_var) / \
                (self.window_length)

    class Vol_3M(CustomFactor):
        inputs = [Returns(window_length=2)]
        window_length = 63

        def compute(self, today, assets, out, rets):
            out[:] = np.nanstd(rets, axis=0)

    def Working_Capital_To_Assets():
        return bs.working_capital.latest / bs.total_assets.latest

    all_factors = {
        'Momentum': Momentum,
        'Growth': Growth,
        'PE ratio': PE_ratio,
        'Sentiment': Sentiment,
        'Asset Growth 3M': Asset_Growth_3M,
        'Asset to Equity Ratio': Asset_To_Equity_Ratio,
        'Capex to Cashflows': Capex_To_Cashflows,
        'EBIT to Assets': EBIT_To_Assets,
        'EBITDA Yield': EBITDA_Yield,
        'Earnings Quality': Earnings_Quality,
        'MACD Signal Line': MACD_Signal_10d,
        'Mean Reversion 1M': Mean_Reversion_1M,
        'Moneyflow Volume 5D': Moneyflow_Volume_5d,
        'Net Income Margin': Net_Income_Margin,
        'Operating Cashflows to Assets': Operating_Cashflows_To_Assets,
        'Price Momentum 3M': Price_Momentum_3M,
        'Price Oscillator': Price_Oscillator,
        'Return on Invest Capital': Return_On_Total_Invest_Capital,
        '39 Week Returns': Returns_39W,
        'Trendline': Trendline,
        'Vol 3M': Vol_3M,
        'Working Capital to Assets': Working_Capital_To_Assets,
    }

    return all_factors
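# A quick sanity check of the Momentum definition above, run on a synthetic
# price path with plain numpy (the linear ramp is made up for illustration;
# this is not part of the pipeline):
_demo_prices = np.linspace(100., 126., 252)  # hypothetical one-year price ramp
_twelve_mo_ret = (_demo_prices[-21] - _demo_prices[-252]) / _demo_prices[-252]
_last_mo_ret = (_demo_prices[-1] - _demo_prices[-21]) / _demo_prices[-21]
# 12-month return excluding the most recent month, minus the last month's return
print "Momentum on a linear ramp: %.4f" % (_twelve_mo_ret - _last_mo_ret)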
# Universe and factor library
universe = Q1500US()
factors = make_factors()

n_fwd_days = 5  # number of days to compute returns over
def make_history_pipeline(factors, universe, n_fwd_days=5):
    # Call .rank() on all factors and mask out the universe
    factor_ranks = {name: f().rank(mask=universe) for name, f in factors.iteritems()}

    # Get cumulative returns over the last n_fwd_days days. We will later shift these.
    factor_ranks['Returns'] = Returns(inputs=[USEquityPricing.open],
                                      mask=universe, window_length=n_fwd_days)

    pipe = Pipeline(screen=universe, columns=factor_ranks)
    return pipe
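# The .rank() calls above make heterogeneous factor values comparable
# cross-sectionally; per day, ranking behaves much like scipy's rankdata
# (up to tie-handling details; toy numbers for illustration only):
print stats.rankdata([0.3, -1.2, 2.0])  # -> [ 2.  1.  3.]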
history_pipe = make_history_pipeline(factors, universe, n_fwd_days=n_fwd_days)

# Run the pipeline over the sample period and time it
start_timer = time()
start = pd.Timestamp("2016-01-01")
end = pd.Timestamp("2017-06-01")
results = run_pipeline(history_pipe, start_date=start, end_date=end)
results.index.names = ['date', 'security']
end_timer = time()
print "Time to run pipeline: %.2f secs" % (end_timer - start_timer)

results.head()
def shift_mask_data(X, Y, upper_percentile=70, lower_percentile=30, n_fwd_days=1):
    # Shift X so that factors at t line up with returns at t + n_fwd_days
    # (we want to predict future returns, after all)
    shifted_X = np.roll(X, n_fwd_days + 1, axis=0)

    # Slice off the rolled elements
    X = shifted_X[n_fwd_days + 1:]
    Y = Y[n_fwd_days + 1:]

    n_time, n_stocks, n_factors = X.shape

    # Look for the biggest up and down movers
    upper = np.nanpercentile(Y, upper_percentile, axis=1)[:, np.newaxis]
    lower = np.nanpercentile(Y, lower_percentile, axis=1)[:, np.newaxis]

    upper_mask = (Y >= upper)
    lower_mask = (Y <= lower)

    mask = upper_mask | lower_mask  # This also drops NaNs
    mask = mask.flatten()

    # Only try to predict whether a stock moved up or down relative to other stocks
    Y_binary = np.zeros(n_time * n_stocks)
    Y_binary[upper_mask.flatten()] = 1
    Y_binary[lower_mask.flatten()] = -1

    # Flatten X
    X = X.reshape((n_time * n_stocks, n_factors))

    # Drop stocks that did not move much (i.e. fall between the percentile cutoffs)
    X = X[mask]
    Y_binary = Y_binary[mask]

    return X, Y_binary
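# A minimal illustration of the shift-and-slice logic above on a toy 1-D
# array (values made up; the real X is 3-D but rolls the same way along
# axis 0). np.roll wraps elements around, so the first n_fwd_days + 1
# rows must be discarded afterwards:
_toy = np.arange(5)
print np.roll(_toy, 2)      # -> [3 4 0 1 2] (last rows wrapped to the front)
print np.roll(_toy, 2)[2:]  # -> [0 1 2]     (wrap-around removed)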
# Massage data to be in the form expected by shift_mask_data()
results_wo_returns = results.copy()
returns = results_wo_returns.pop('Returns')
Y = returns.unstack().values
X = results_wo_returns.to_panel()
X = X.swapaxes(2, 0).swapaxes(0, 1).values # (factors, time, stocks) -> (time, stocks, factors)
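# The axis gymnastics above, on a toy array: shape (n_factors=2, n_time=3,
# n_stocks=4) becomes (time, stocks, factors) after the two swaps
# (illustrative shapes only):
print np.zeros((2, 3, 4)).swapaxes(2, 0).swapaxes(0, 1).shape  # -> (3, 4, 2)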
# Relabel the index with human-readable symbols and dates for display
results_wo_returns.index = results_wo_returns.index.set_levels(
    results_wo_returns.index.get_level_values(1).map(lambda x: x.symbol), 1)
results_wo_returns.index = results_wo_returns.index.set_levels(
    results_wo_returns.index.get_level_values(0).map(lambda x: x.date), 0)
results_wo_returns.sample(10).sort()

tmp = (returns > 0.).to_frame()
tmp.index = tmp.index.set_levels(tmp.index.get_level_values(1).map(lambda x: x.symbol), 1)
tmp.columns = ['5-day forward returns > 0']
tmp.sample(10).sort()

# Count missing values per factor
results_wo_returns.isnull().sum()
# Train-test split along the time axis
train_size_perc = 0.8
n_time, n_stocks, n_factors = X.shape
train_size = np.int16(np.round(train_size_perc * n_time))
X_train, Y_train = X[:train_size, ...], Y[:train_size]
# Skip n_fwd_days between train and test so forward returns do not overlap
X_test, Y_test = X[(train_size + n_fwd_days):, ...], Y[(train_size + n_fwd_days):]

X_train_shift, Y_train_shift = shift_mask_data(X_train, Y_train, n_fwd_days=n_fwd_days)
# On the test set, use a 50/50 percentile split so every stock gets a label
# and accuracy is measured over the full cross-section
X_test_shift, Y_test_shift = shift_mask_data(X_test, Y_test, n_fwd_days=n_fwd_days,
                                             lower_percentile=50,
                                             upper_percentile=50)

X_train_shift.shape, X_test_shift.shape
start_timer = time()

# Train the classifier
imputer = preprocessing.Imputer()
scaler = preprocessing.MinMaxScaler()
clf = ensemble.AdaBoostClassifier(n_estimators=150)  # n_estimators controls how many weak classifiers are fit

X_train_trans = imputer.fit_transform(X_train_shift)
X_train_trans = scaler.fit_transform(X_train_trans)
clf.fit(X_train_trans, Y_train_shift)

end_timer = time()
print "Time to train full ML pipeline: %0.2f secs" % (end_timer - start_timer)

Y_pred = clf.predict(X_train_trans)
print('Accuracy on train set = {:.2f}%'.format(metrics.accuracy_score(Y_train_shift, Y_pred) * 100))
# Transform the test data with the imputer and scaler fit on the training set
X_test_trans = imputer.transform(X_test_shift)
X_test_trans = scaler.transform(X_test_trans)

# Predict
Y_pred = clf.predict(X_test_trans)
Y_pred_prob = clf.predict_proba(X_test_trans)

print 'Predictions:', Y_pred
print 'Probabilities of class == 1:', Y_pred_prob[:, 1] * 100

print('Accuracy on test set = {:.2f}%'.format(metrics.accuracy_score(Y_test_shift, Y_pred) * 100))
print('Log-loss = {:.5f}'.format(metrics.log_loss(Y_test_shift, Y_pred_prob)))
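# One extra check, not in the original analysis: a confusion matrix over the
# two classes (-1 = underperformer, 1 = outperformer) shows where the
# classifier's errors concentrate; metrics.confusion_matrix is standard sklearn.
print metrics.confusion_matrix(Y_test_shift, Y_pred, labels=[-1, 1])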
# Plot feature importances of the fitted AdaBoost model
feature_importances = pd.Series(clf.feature_importances_, index=results_wo_returns.columns)
feature_importances.sort(ascending=False)
ax = feature_importances.plot(kind='bar')
ax.set(ylabel='Importance (Gini Coefficient)', title='Feature importances')