Posts
布隆过滤器

布隆过滤器

完全不记得当时为什么要学这个了 可能是心血来潮吧

原理参考的bloom filterbloomfilter-tutorial,讲的都十分清楚,用bitmap来记录 键的多组哈希编码, 由于位图的大小限制以及哈希算法的冲突,存在一定的错误率, 可通过增加位图空间大小以及替换均匀分布更好的哈希算法来降低错误率

实现参考的bloomd,以哈希算法的数量来划分位图, 将命中独立开来,然后哈希的选取可以以两种为基底, g_i(x) = h1(u) + i * h2(u) mod m, 组合成多种,证明以后有时间再看看吧

#ifndef BLOOM_FILTER_H
#define BLOOM_FILTER_H
 
#include <stdio.h>
#include "hashlib.h"
 
typedef struct
{
	uint32_t n;
	uint32_t k;
	uint32_t size;
    uint32_t offset;
	unsigned char *bits;
} bloom_filter;
 
 
 
static inline int bf_getbit(bloom_filter *bf, uint32_t idx)
{
	return (bf->bits[idx >> 3] >> (7 - idx % 8)) & 0x1;
}
 
static inline void bf_setbit(bloom_filter *bf, uint32_t idx)
{
	unsigned char byte = bf->bits[idx >> 3];
	unsigned char offset = 7 - idx % 8;
	byte |= 1 << offset;
	bf->bits[idx >> 3] = byte;
}
 
bloom_filter *bf_init(uint32_t k, uint32_t nboxes);
void bf_hash(uint32_t *hashes, uint32_t k, char *key);
int bf_search(bloom_filter *bf, uint32_t *hashes);
 
int bf_add(bloom_filter *bf, char *key);
int bf_contain(bloom_filter *bf, char *key);
 
void free_bf(bloom_filter *bf);
void print_bf(bloom_filter *bf);
 
 
#endif
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
 
#include "bloom_filter.h"
    
bloom_filter *bf_init(uint32_t k, uint32_t nboxes)
{
    bloom_filter *bf;
    bf = malloc(sizeof *bf);
    // actual bytes + 1bit
    bf->bits = malloc(((nboxes >> 3) + 1) * sizeof *bf->bits);
 
    bf->n = 0; // num of element
    bf->k = k; // num of hash >=4
    bf->size = nboxes;
    bf->offset = nboxes / k; // nboxes divide into k block, offset represent the block size
    return bf;
}
 
int bf_search(bloom_filter *bf, uint32_t *hashes)
{
    uint32_t i, idx;
    for(i = 0; i < bf->k; i++)
    {
        idx = i * bf->offset + hashes[i] % bf->offset;
        if(bf_getbit(bf, idx))
            return 1;
    }
    return 0;
}
 
void bf_hash(uint32_t *hashes, uint32_t k, char *key)
{
    size_t len = strlen(key);
    hashes[0] = murmur3_32(key, len, 0);
    hashes[1] = fnv1a(key, len);
 
    for(uint32_t i = 2; i < k; i++)
        hashes[i] = hashes[1] + ((i + hashes[0]) % 18446744073709551557U);
}
 
int bf_add(bloom_filter *bf, char *key)
{
    uint32_t *hashes = alloca(bf->k * sizeof(uint32_t));	
    
    bf_hash(hashes, bf->k, key);
 
    if(bf_search(bf, hashes))
        return 0;
 
    uint32_t i, idx;
    for(i = 0; i < bf->k; i++)
    {
        idx = i * bf->offset + hashes[i] % bf->offset;	
        bf_setbit(bf, idx);
    }
    bf->n += 1;
    return 1;
}
 
int bf_contain(bloom_filter *bf, char *key)
{
    uint32_t *hashes;
    hashes = alloca(bf->k * sizeof *hashes);
    bf_hash(hashes, bf->k, key);
    return bf_search(bf, hashes);
}
 
void free_bf(bloom_filter *bf)
{
    free(bf->bits);
    free(bf);
}
 
void print_bf(bloom_filter *bf)
{
    printf("k: %u n: %u size: %u\n", bf->k, bf->n, bf->size);
    uint32_t n = (bf->size > 30) ? 30 : bf->size;
    for(uint32_t i = 0; i < n; i++)
    {
        if((i % 8) == 0)
            printf(" ");
        printf("%d", (bf->bits[i >> 3] >> (7 - i % 8)) & 0x1);
    }
    printf("\n");
    
}
 
 
int test_bf()
{
    bloom_filter *bf = bf_init(2, 20);
    print_bf(bf);
    bf_setbit(bf, 2);
    if(!bf_getbit(bf, 2))
        printf("bf_getbit fail!\n");
    bf_add(bf, "ASDJ");
    
    uint32_t *hs = malloc(2 * sizeof(uint32_t *));
    bf_hash(hs, 2, "ASDJ");
    bf_search(bf, hs);
    
    print_bf(bf);
    if(!bf_contain(bf, "ASDJ"))
        printf("add fail!\n");
    
    print_bf(bf);
    
    free(hs);
    free_bf(bf);
    
    getchar();
    return 0;
}
//
//k: 2 n: 0 size: 20
//00000000 00000000 0000
//k: 2 n: 1 size: 20
//00100001 00000000 0001
 

murmur3fnv1a参考了wiki , 关于hash原理相关的异或、rotate印度老哥讲得挺明白的

#ifndef HASH_H
#define HASH_H
 
static inline uint32_t rotl32 ( uint32_t x, int8_t r )
{
	  return (x << r) | (x >> (32 - r));
}
 
static inline uint32_t fmix(uint32_t h)
{
	h ^= h >> 16;
	h *= 0x85ebca6b;
	h ^= h >> 13;
	h *= 0xc2b2ae35;
	h ^= h >> 16;
	return h;
}
 
uint32_t murmur3_32(const void *key, size_t len, uint32_t seed);
 
uint32_t fnv1a(const void *key, size_t len); 
 
 
#endif
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
 
#include "hashlib.h"
 
static uint32_t murmur32_scramble(uint32_t k)
{
    k *= 0xcc9e2d51;
    k = rotl32(k, 15);
    k *= 0x1b873593;
    return k;
}
 
uint32_t murmur3_32(const void *key, size_t len, uint32_t seed)
{
    uint8_t *data = (uint8_t *)key;
    uint32_t h = seed;
    uint32_t k;
    
    for(size_t i = len >> 2; i; i--)
    {
        k = *((uint32_t *)data);
        data += sizeof(uint32_t);
 
        h ^= murmur32_scramble(k);
        h = rotl32(h, 13);
        h = h * 5 + 0xe6546b64;
    }
 
    k = 0;
    for(size_t i = len & 3; i; i--)
    {
        k <<= 8;
        k |= data[i - 1];
    }
 
    h ^= murmur32_scramble(k);
    h ^= len;
    
    h = fmix(h);
    return h;
}
 
uint32_t fnv1a(const void *key, size_t len)
{
    uint8_t *data = (uint8_t *)key;
 
    uint32_t h = 0x811c9dc5;  // offset
    uint32_t k;
 
    for(size_t i = 0; i < len; i++)
    {
        k = *data;
        data += sizeof *data;
        
        h *= 0x01000193; // prime
        h ^= k;
    }
    return h;
}
 
    
// murmur 				output
// // A  S  D  J    %u
// // 65 83 68 74  3398122917
//
// k val first loop
// 1245991745
// // 0b100 1010 0100 0100 0101 0011 0100 0001
//
// fnv1a 				output
// // A  S  D  J    %u	
// // 65 83 68 74  60015639  
 

对大规模的去重,则要考虑哈希摘要的位数了 cpp的实现版本libbf, 还没细看 python-bloomfilter, 内容不是很多, 基于bitarray位图模块,可以看看pythonc库实现 Scrapy_Redis_Bloomfilter