model_selection.py
""" Compare Bayesian models with the Bayes Factor
Examples
--------
>>> from numpy import arange, randn
>>> import models
>>> import model_selection
>>> X = randn(4000)
>>> Y = 2*X - 1 + randn(4000) # data actually is linear
>>> m1 = models.linear(X=X, Y=Y, order=1)
>>> m2 = models.linear(X=X, Y=Y, order=2)
>>> K = model_selection.bayes_factor(m1, m2)
>>> assert K >= 1
>>> assert K < 3
>>> Y = 2*X**2 - 1 + randn(1000) # data actually is quadratic
>>> m1 = models.linear(X=X, Y=Y, order=1)
>>> m2 = models.linear(X=X, Y=Y, order=2)
>>> K = model_selection.bayes_factor(m1, m2)
>>> assert K >= 100
>>> import data
>>> m1=models.linear(X=data.hdi, Y=data.tfr, order=2)
>>> m2=models.piecewise_linear(X=data.hdi, Y=data.tfr)
>>> model_selection.bayes_factor(m1, m2)
"""
import pymc
from pymc import *
from numpy import exp, mean, log, array

def bayes_factor(m1, m2, iter=1e6, burn=25000, thin=10, verbose=0):
    """ Approximate the Bayes factor, the ratio of marginal likelihoods::

        K = Pr[data | m2] / Pr[data | m1]

    using the harmonic mean of the posterior likelihood values for each
    model::

        Pr[data | m] ~= 1 / mean(1 / Pr[data | theta_i])

    to compare 2 models.  According to Wikipedia / Jeffreys, the
    interpretation of K is the following::

        K < 1          - data supports m1
        1 <= K < 3     - data supports m2, but is "Barely worth mentioning"
        3 <= K < 10    - data gives "Substantial" support for m2
        10 <= K < 30   - "Strong" support for m2
        30 <= K < 100  - "Very strong" support for m2
        K >= 100       - "Decisive" support for m2

    Parameters
    ----------
    m1 : object containing PyMC model vars and logp method
    m2 : object containing PyMC model vars and logp method
      m1 and m2 must each include a method called 'logp', which
      calculates the posterior log-likelihood at all MCMC samples
      stored in the stochastic traces
    iter : int, optional
      number of MCMC samples to retain
    burn : int, optional
      number of initial MCMC iterations to discard
    thin : int, optional
      number of MCMC iterations to discard between retained samples
    verbose : int, optional
      amount of output to request from PyMC

    Returns
    -------
    K : float
      estimate of the Bayes factor

    Notes
    -----
    This sneaky harmonic mean of likelihood values appears in Kass and
    Raftery (1995), where it is attributed to Newton and Raftery (1994).
    """
    # draw posterior samples for each model and evaluate the posterior
    # log-likelihood at every retained sample
    MCMC(m1).sample(int(iter*thin + burn), burn, thin, verbose=verbose)
    logp1 = m1.logp()

    MCMC(m2).sample(int(iter*thin + burn), burn, thin, verbose=verbose)
    logp2 = m2.logp()

    # harmonic mean on the log scale, log Pr[data | m] ~= log(len(logp)) - logsum(-logp),
    # so K = exp(log Pr[data | m2] - log Pr[data | m1]);
    # pymc.flib.logsum suggested by Anand Patil, http://gist.github.com/179657
    K = exp(pymc.flib.logsum(-logp1) - log(len(logp1))
            - (pymc.flib.logsum(-logp2) - log(len(logp2))))
    return K
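

# Illustration only: a minimal sketch (not used by bayes_factor above) of the
# harmonic-mean marginal-likelihood estimate that bayes_factor applies to each
# model.  The helper name _log_marginal_likelihood is an assumption for
# exposition; it simply restates the logsum expression used above for a single
# model.
def _log_marginal_likelihood(logp):
    """ Approximate log Pr[data | m] as the log of the harmonic mean of the
    likelihood values exp(logp), where logp is an array of posterior
    log-likelihoods evaluated at the MCMC samples::

        log Pr[data | m] ~= log(len(logp)) - log(sum(exp(-logp)))
    """
    logp = array(logp)
    return log(len(logp)) - pymc.flib.logsum(-logp)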


class binomial_model:
    """ n successes out of N trials, with success probability q
    (either a fixed float or a PyMC Stochastic). """
    def __init__(self, n=115, N=200, q=.5):
        self.n = n
        self.N = N
        self.q = q

        @potential
        def data_potential(n=self.n, N=self.N, q=self.q):
            return binomial_like(n, N, q)
        self.data_potential = data_potential

    def logp(self):
        # if q is a fixed float, or a Stochastic that has not been sampled yet,
        # there is only a single posterior log-probability value
        if isinstance(self.q, float):
            return array([Model(self).logp])
        elif isinstance(self.q, Variable) and isinstance(self.q.trace, bool):  # trace is not a function until MCMC is run
            return array([Model(self).logp])

        # otherwise, evaluate the model logp at every sampled value of q
        logp = []
        for q_val in self.q.trace():
            self.q.value = q_val
            logp.append(Model(self).logp)
        return array(logp)


def wikipedia_example(iter=1e6, burn=25000, thin=10, verbose=1):
    """ Based on the Wikipedia example: 115 heads from 200 coin
    tosses, m1 = fair coin, m2 = coin with success probability q,
    uniform [0,1] prior on q.

    >>> K = wikipedia_example()
    >>> assert 1.196 < K < 1.198
    """
    m1 = binomial_model(115, 200, .5)
    m2 = binomial_model(115, 200, Uniform('q', 0., 1.))
    return bayes_factor(m1, m2, iter=iter, burn=burn, thin=thin, verbose=verbose)
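

# A conventional entry point (an assumed addition, not required by the module):
# running this file directly executes the doctests above.  Note that the doctests
# assume the companion `models` and `data` modules are importable, and they run
# long MCMC chains, so expect them to take a while.
if __name__ == '__main__':
    import doctest
    doctest.testmod()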