寻找最大的k个数，TopK问题的C++实现

FXMAR · 发表于 2015-11-23 15:37:39

　　2亿个整数中求最大的100万之和
　　题目：有一个文件中保存了2亿个整数，每个整数都以' '分隔。求最大的100万个整数之和。
　　算法：

1. 首先建立一个容量为100万(nTop)的int数组，从文件读取整数填充。

2. 利用堆维护该100万条记录(确保堆顶元素为最小值)

3. 从文件中读取一个整数与堆顶元素比较，如果大于堆顶元素则替换该元素，并调整堆的结构。

4. 重复步骤3一直到数据读取完

5. 将数组中的元素全部相加，得到结果
　　参考源代码:

#include <iostream>
using namespace std;
template<class T>
class CTopK
{
public:
CTopK();
~CTopK();
T*  m_Data;
int GetTopK(const char* sFile, int& nTop);
private:
void Clear();
void HeapAdjust(int nStart, int nLen);
void BuildHeap(int nLen);
};
template<class T>
CTopK<T>::CTopK()
{
m_Data = NULL;
}
template<class T>
CTopK<T>::~CTopK()
{
Clear();
}
template<class T>
void CTopK<T>::Clear()
{
if (NULL != m_Data)
{
delete[] m_Data;
m_Data = NULL;
}
}
//获取前top的k个数
template<class T>
int CTopK<T>::GetTopK(const char* sFile, int& nTop)
{
FILE* fp = NULL;
T fData;
int i = 0;
//判断输入参数
if ( (NULL == sFile) || (nTop <= 0) )
{
cout << "error parameter" << endl;
return -1;
}
//清除操作
Clear();
//打开文件
fp = fopen(sFile, "r");
if (NULL == fp)
{
cout << "open file failed!" << endl;
return -1;
}
//分配空间
m_Data = new T[nTop];
if (NULL == m_Data)
{
cout << "new operator failed!" << endl;
return -1;
}
cout << "please wait..." << endl;
//读取前nTop个数据,注意数据的类型T
for (i = 0; i < nTop; i++)
{
if (EOF != fscanf(fp, "%d", &fData))
{
m_Data = fData;
}
else
{
break;
}
}
//最大的个数小于nTop,求前i个数据
if (i < nTop)
{
nTop = i;
}
else
{
BuildHeap(nTop);//建立小顶堆
while (EOF != fscanf(fp, "%d", &fData))
{
if (fData > m_Data[0])
{
//交换并调整堆
m_Data[0] = fData;
HeapAdjust(0, nTop);
}
}
}
return 0;
}
//调整小根堆，堆顶为Top K最小
template<class T>
void CTopK<T>::HeapAdjust(int nStart, int nLen)
{
int nMinChild = 0;
T fTemp;
while ( ( 2 * nStart + 1) < nLen)
{
nMinChild = 2 * nStart + 1;
if ( (2 * nStart + 2) < nLen)
{
//比较左子树和右子树,记录最小值的Index
if (m_Data[2 * nStart + 2] < m_Data[2 * nStart + 1])
{
nMinChild = 2 * nStart + 2;
}
}
//change data
if (m_Data[nStart] > m_Data[nMinChild])
{
//交换nStart与nMaxChild的数据
fTemp = m_Data[nStart];
m_Data[nStart] = m_Data[nMinChild];
m_Data[nMinChild] = fTemp;
//堆被破坏,需要重新调整
nStart = nMinChild;
}
else
{
//比较左右孩子均大则堆未破坏,不再需要调整
break;
}
}
}
//建立堆
template<class T>
void CTopK<T>::BuildHeap(int nLen)
{
int i = 0;
T nTemp;
//将m_Data[0, Len-1]建成小根堆，这里只维护一个小根堆，不排序
for (i = nLen / 2  - 1; i >= 0; i--)
{
HeapAdjust(i, nLen);
}
}
int main(int argc, char* argv[])
{
char szFile[100] = {0};
int    nNum = 0;
CTopK<int> objTopSum;
cout << "please input count file name:" << endl;
cin >> szFile;
cout << "please input top num:"<< endl;
cin >> nNum;
objTopSum.GetTopK(szFile, nNum);
int fSum = 0;
for (int i = 0; i < nNum; i++)
{
cout<<objTopSum.m_Data<<" ";
fSum += objTopSum.m_Data;
}
cout << "\ntop " << nNum << " value = " << fSum << endl;
return 0;
}
　　
　　搜索引擎热门查询统计
　　题目描述：

搜索引擎会通过日志文件把用户每次检索使用的所有检索串都记录下来，每个查询串的长度为1-255字节。

假设目前有一千万个记录，这些查询串的重复度比较高，虽然总数是1千万，但如果除去重复后，不超过3百万个。一个查询串的重复度越高，说明查询它的用户越多，也就是越热门。请你统计最热门的10个查询串，要求使用的内存不能超过1G。
　　解决方法：hash表+堆
　　第一步、先对这批海量数据预处理，在O（N）的时间内用Hash表完成统计；

第二步、借助堆这个数据结构，找出Top K，时间复杂度为NlogK。即借助堆结构，我们可以在log量级的时间内查找和调整/移动。因此，维护一个K(该题目中是10)大小的小根堆（Kmin设为堆顶元素），然后遍历300万的Query，分别和Kmin进行对比比较（若X>Kmin，则更新并调整堆，否则，不更新），我们最终的时间复杂度是：O（N） + N'*O（logK），（N为1000万，N’为300万）。
　　为了降低实现上的难度，假设这些记录全部是一些英文单词，即用户在搜索框里敲入一个英文单词，然后查询搜索结果，最后，要你统计输入单词中频率最大的前K个单词。复杂问题简单化了之后，编写代码实现也相对轻松多了，如下：

//copyright@yansha &&July
//July、updated，2011.05.08
//题目描述：
//搜索引擎会通过日志文件把用户每次检索使用的所有检索串都记录下来，每个查询串的
//长度为1-255字节。假设目前有一千万个记录（这些查询串的重复度比较高，虽然总数是1千万，但如果
//除去重复后，不超过3百万个。一个查询串的重复度越高，说明查询它的用户越多，也就是越热门），
//请你统计最热门的10个查询串，要求使用的内存不能超过1G。
#include <iostream>
#include <string>
#include <assert.h>
using namespace std;
#define HASHLEN 2807303
#define WORDLEN 30
// 结点指针
typedef struct node_no_space *ptr_no_space;
typedef struct node_has_space *ptr_has_space;
ptr_no_space head[HASHLEN];
struct node_no_space
{
char *word;
int count;
ptr_no_space next;
};
struct node_has_space
{
char word[WORDLEN];
int count;
ptr_has_space next;
};
// 最简单hash函数
int hash_function(const char *p)
{
int value = 0;
while (*p != '\0')
{
value = value * 31 + *p++;
if (value > HASHLEN)
value = value % HASHLEN;
}
return value;
}
// 添加单词到hash表
void append_word(const char *str)
{
int index = hash_function(str);
ptr_no_space p = head[index];
while (p != NULL)
{
if (strcmp(str, p->word) == 0)
{
(p->count)++;
return;
}
p = p->next;
}
// 新建一个结点
ptr_no_space q = new node_no_space;
q->count = 1;
q->word = new char [strlen(str)+1];
strcpy(q->word, str);
q->next = head[index];
head[index] = q;
}

// 将单词处理结果写入文件
void write_to_file()
{
FILE *fp = fopen("result.txt", "w");
assert(fp);
int i = 0;
while (i < HASHLEN)
{
for (ptr_no_space p = head; p != NULL; p = p->next)
fprintf(fp, "%s  %d\n", p->word, p->count);
i++;
}
fclose(fp);
}
// 从上往下筛选，保持小根堆
void sift_down(node_has_space heap[], int i, int len)
{
int min_index = -1;
int left = 2 * i;
int right = 2 * i + 1;
if (left <= len && heap[left].count < heap.count)
min_index = left;
else
min_index = i;
if (right <= len && heap[right].count < heap[min_index].count)
min_index = right;
if (min_index != i)
{
// 交换结点元素
swap(heap.count, heap[min_index].count);
char buffer[WORDLEN];
strcpy(buffer, heap.word);
strcpy(heap.word, heap[min_index].word);
strcpy(heap[min_index].word, buffer);
sift_down(heap, min_index, len);
}
}
// 建立小根堆
void build_min_heap(node_has_space heap[], int len)
{
if (heap == NULL)
return;
int index = len / 2;
for (int i = index; i >= 1; i--)
sift_down(heap, i, len);
}
// 去除字符串前后符号
void handle_symbol(char *str, int n)
{
while (str[n] < '0' || (str[n] > '9' && str[n] < 'A') || (str[n] > 'Z' && str[n] < 'a') || str[n] > 'z')
{
str[n] = '\0';
n--;
}
while (str[0] < '0' || (str[0] > '9' && str[0] < 'A') || (str[0] > 'Z' && str[0] < 'a') || str[0] > 'z')
{
int i = 0;
while (i < n)
{
str = str[i+1];
i++;
}
str = '\0';
n--;
}
}
int main()
{
char str[WORDLEN];
for (int i = 0; i < HASHLEN; i++)
head = NULL;
// 将字符串用hash函数转换成一个整数并统计出现频率
FILE *fp_passage = fopen("string.txt", "r");
assert(fp_passage);
while (fscanf(fp_passage, "%s", str) != EOF)
{
int n = strlen(str) - 1;
if (n > 0)
handle_symbol(str, n);
append_word(str);
}
fclose(fp_passage);
// 将统计结果输入文件
write_to_file();
int n = 10;
ptr_has_space heap = new node_has_space [n+1];
int c;
FILE *fp_word = fopen("result.txt", "r");
assert(fp_word);
for (int j = 1; j <= n; j++)
{
fscanf(fp_word, "%s %d", &str, &c);
heap[j].count = c;
strcpy(heap[j].word, str);
}
// 建立小根堆
build_min_heap(heap, n);
// 查找出现频率最大的10个单词
while (fscanf(fp_word, "%s %d", &str, &c) != EOF)
{
if (c > heap[1].count)
{
heap[1].count = c;
strcpy(heap[1].word, str);
sift_down(heap, 1, n);
}
}
fclose(fp_word);
// 输出出现频率最大的单词
for (int k = 1; k <= n; k++)
cout << heap[k].count << " " << heap[k].word << endl;
return 0;
}
　　
　　参考：
　　http://blog.iyunv.com/andylin02/archive/2008/11/28/3401123.aspx
　　http://blog.iyunv.com/v_JULY_v/archive/2011/05/08/6403777.aspx
　　

账号		自动登录	找回密码
密码			立即注册

VMware vcenter+vSphere 6.5 U2共享

【跟谁学】韩宇极简英语课-技术人员不得不

用Zabbix通过JMX方式监控weblogic

winhex数据恢复教程（非常巨大，内容丰富）

Symantec Backup Exec 2015 2016/2012 BE20

NetScaler VPX部署之：NetScaler Gateway调

zabbix3.4.1安装部署+微信推送信息+大屏显

[经验分享] 寻找最大的k个数，TopK问题的C++实现

扫码加入运维网微信交流群