This is an implementation of the Lossy Counting Algorithm described in Manku and Motwani's 2002 paper "Approximate Frequency Counts over Data Streams".
It is an algorithm for estimating which elements in a stream have a frequency count exceeding a given threshold, while using only limited memory. For example, for video view counts on something like YouTube, finding which videos each constitute more than 3% of the views.
Please let me know if you spot any bugs.
from math import ceil
class LossyCount:
    """Approximate frequency counting over a data stream.

    Implements the Lossy Counting algorithm from Manku & Motwani,
    "Approximate Frequency Counts over Data Streams" (VLDB 2002).
    Reported counts undercount the true frequency by at most
    max_error * n, while memory usage stays bounded by roughly
    (1/max_error) * log(max_error * n) entries.
    """

    def __init__(self, max_error=0.005):
        # max_error is the parameter the paper calls epsilon: the maximum
        # fraction of the stream length by which any count may be under.
        self.max_error = max_error
        # Width of each bucket; entries are pruned at bucket boundaries.
        self.bucket_width = ceil(1 / max_error)
        # item -> (freq, delta): freq is the estimated count since the item
        # was (last) inserted, delta the maximum possible undercount.
        self.entries = dict()
        # Total number of items seen so far (N in the paper).
        self.n = 0

    def put(self, x):
        """Record one occurrence of item x."""
        self.n += 1
        current_bucket = ceil(self.n / self.bucket_width)
        if x in self.entries:
            # Known item: just bump its estimated frequency.
            freq, delta = self.entries[x]
            self.entries[x] = (freq + 1, delta)
        else:
            # New item: it could have been seen and pruned in any of the
            # previous current_bucket-1 buckets, hence the delta term.
            # (Starting freq at 1, not 2 — incrementing a fresh entry
            # would overestimate every tracked item's count by one.)
            self.entries[x] = (1, current_bucket - 1)
        # At each bucket boundary, prune entries whose maximum possible
        # true count (freq + delta) cannot reach the current bucket index.
        if self.n % self.bucket_width == 0:
            prune = [key for key, (freq, delta) in self.entries.items()
                     if freq + delta <= current_bucket]
            for key in prune:
                del self.entries[key]

    def get(self, support_threshold=0.001):
        """Return items whose frequency may exceed support_threshold * n.

        support_threshold is the parameter the paper calls s. Guaranteed to
        include every item with true frequency >= support_threshold * n;
        may also include items above (support_threshold - max_error) * n.
        """
        cutoff = (support_threshold - self.max_error) * self.n
        return [key for key, (freq, _) in self.entries.items()
                if freq >= cutoff]
# Generate test data: a synthetic stream of video views where a handful of
# videos dominate, so that some exceed the support threshold.
from math import log
from random import random, randint

view_cnt = 500000
videos_cnt = 100000

# Uniform samples pushed through a heavy-tailed transform.
x = [random() for _ in range(view_cnt)]
# y = [1/v for v in x]  # This distribution is too steep...
# A distribution that is reasonably steep and has a very long tail:
y = [(1/v)*0.01 - log(v) for v in x]
m = max(y)
y = [v / m for v in y]  # normalize into (0, 1]

# ids = [i for i in range(videos_cnt)]  # Easy to read IDs, but unrealistic.
# Most popular video would have ID 0, second most popular ID 1, etc.
ids = [randint(1000000, 9000000) for _ in range(videos_cnt)]  # More realistic video IDs.

# Map each normalized sample to a video index, then to its ID.
idxs = [int(v * (videos_cnt - 1)) for v in y]
views = [ids[i] for i in idxs]

# import matplotlib.pyplot as plt
# plt.hist(views, bins=200)  # Only works when the IDs are 1,2,3,4...
# plt.show()
threshold = 0.03  # We are interested in videos that each constitute more than 3% of the views.

# Generate exact results using a counter, to compare with.
from collections import Counter
c = Counter(views)
min_views = view_cnt * threshold
popular = [video_id for video_id, cnt in c.items() if cnt >= min_views]
print(sorted(popular))
# Test the LossyCount class. Should give similar (but not exact) results
# to the exact Counter-based computation above.
lc = LossyCount()
for view in views:
    lc.put(view)
print(sorted(lc.get(threshold)))
No comments:
Post a Comment