Skip to content

Commit

Permalink
add (broken) optimization stuff; uneven distribution
Browse files Browse the repository at this point in the history
  • Loading branch information
Nicolas Weber committed Feb 11, 2009
1 parent a492649 commit 2d1bb20
Showing 1 changed file with 44 additions and 7 deletions.
51 changes: 44 additions & 7 deletions numpredict.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,19 @@ def wineprice(rating, age):
return max(0, price)


def wineset1(k=300):
    """Generate a random wine data set with `k` rows.

    Each row is a dict with two keys:
      'input'  -- a (rating, age) tuple; rating is uniform in [50, 100),
                  age is uniform in [0, 50)
      'result' -- wineprice(rating, age) multiplied by a random noise
                  factor in [0.8, 1.2)

    `k` defaults to 300, the size that used to be hard-coded.
    """
    rows = []
    for _ in range(k):
        rating = random.random()*50 + 50
        age = random.random() * 50
        # Multiplicative noise of +/-20% around the model price.
        price = wineprice(rating, age) * (random.random()*0.4 + 0.8)
        rows.append({'input': (rating, age), 'result': price})
    return rows


def wineset2():
def wineset2(k=300):
rows = []
for i in range(300):
for i in range(k):
rating = random.random()*50 + 50
age = random.random() * 50
aisle = float(random.randint(1, 20))
Expand All @@ -42,6 +42,14 @@ def wineset2():
return rows


def wineset3(k=300, discount_prob=0.5, discount=0.6):
    """Generate a wine data set with an uneven price distribution.

    Starts from wineset1(k) and, independently for each row, multiplies the
    row's 'result' by `discount` with probability `discount_prob`. This
    simulates some bottles having been bought at a "discount price".

    The defaults (0.5 and 0.6) are the values that used to be hard-coded, so
    existing callers get the same behavior.
    """
    rows = wineset1(k)
    for row in rows:
        if random.random() < discount_prob:
            # "discount price", to simulate uneven distribution
            row['result'] *= discount
    return rows


euclidean = clusters.euclid_dist


Expand Down Expand Up @@ -91,7 +99,15 @@ def partition(l, pred):


def dividedata(data, pTest=0.05):
    """Randomly split `data` into a (trainset, testset) pair of lists.

    data  -- a sequence of rows; it is not modified
    pTest -- fraction of the rows that should end up in the test set

    The test set gets int(len(data) * pTest) rows, clamped to the range
    [0, len(data)]. If that rounds down to zero while pTest is positive and
    there are at least two rows, one row is moved to the test set so that
    neither set is empty.
    """
    # The approach used in the book,
    #   partition(data, lambda r: random.random() < pTest),
    # lets the size of the test set vary from call to call (sometimes the
    # test set has 0 elements!), so shuffle a copy and cut off a fixed-size
    # tail instead.
    n = len(data)
    nTest = min(n, max(0, int(n * pTest)))
    if nTest == 0 and pTest > 0 and n > 1:
        nTest = 1
    shuffledData = list(data)
    random.shuffle(shuffledData)
    return shuffledData[:n - nTest], shuffledData[n - nTest:]


def testalgorithm(algfun, trainset, testset):
Expand All @@ -118,8 +134,16 @@ def rescale(data, scale):
return scaledata


def createcostfunction(algfun, data, trials=100):
    """Build a cost function over scale vectors, for use by an optimizer.

    algfun -- the estimator passed through to crossvalidate()
    data   -- the unscaled data set
    trials -- number of cross-validation trials per evaluation (default 100,
              the value that used to be hard-coded)

    The returned function takes a `scale` argument, rescales `data` with it
    via rescale(), and returns what crossvalidate() reports for `algfun` on
    the rescaled data.
    """
    def costf(scale):
        # Trace every candidate so progress of a long optimization run is
        # visible. Parenthesized so it parses as both a Python 2 print
        # statement and a Python 3 function call.
        print(scale)
        sdata = rescale(data, scale)
        return crossvalidate(algfun, sdata, trials=trials)
    return costf


if __name__ == '__main__':
s = wineset1()
s = wineset1(50)

print knnestimate(s, (95.0, 3.0), k=1)
print knnestimate(s, (95.0, 3.0), k=3)
Expand All @@ -133,6 +157,19 @@ def rescale(data, scale):
print crossvalidate(lambda d, v: knnestimate(d, v, k=7), s)
print crossvalidate(lambda d, v: weightedknn(d, v, k=5), s)

s = wineset2()
print
print 'set 2, not-to-scale parameters (XXX buggy, broken, incomplete)'
s = wineset2(50)
print crossvalidate(knnestimate, s)
print crossvalidate(knnestimate, rescale(s, [10, 10, 0, 0.5]))

# Automatically figure out rescaling parameters. This runs forever, and my
# optimization code might be broken: it recomputes the same values over and
# over again. I should cache them and fix the broken stuff. (XXX)
#import optimization
#print optimization.annealingoptimize([(0, 20)] * 4,
#createcostfunction(knnestimate, s), step=2)

print
print 'set 3, uneven distribution'
s = wineset3()

0 comments on commit 2d1bb20

Please sign in to comment.