设为首页 收藏本站
查看: 882|回复: 0

[经验分享] C++ vs Python向量运算速度评测

[复制链接]

尚未签到

发表于 2015-11-29 15:51:28 | 显示全部楼层 |阅读模式
  本文的起源来自最近一个让我非常不爽的事。
  我最近在改一个开源RNN工具包currennt(http://sourceforge.net/projects/currennt/),想用它实现RNNLM功能。
  currennt使用了大量的面向对象的编程技巧,可以使用GPU,向量运算使用了thrust库(https://code.google.com/p/thrust/)。
  RNNLM(http://rnnlm.org/)也有相应开源实现,非常算法风格的代码,向量运算就是自己使用数组实现的。
  结果……大出我的意料,在不使用GPU的情况下,currennt慢成狗!我不断地修改,直到最后几乎完全在currennt里重写了一个RNNLM……速度才终于一致了。这花费了我大量时间,最关键的是我根本没打算花这些时间,算是计划外开销。
  所以这里干脆对常用的几种向量运算做个评测,下回遇到至少心里有数。

  参与评测的向量实现包括:


  • C++ array
  • C++ STL vector
  • C++ thrust(CPU)
  • C++ thrust(GPU)
  • python
  • python numpy
  评测指标包括:


  • 创建、填充向量
  • 向量点乘,相乘
  • 矩阵相乘
  测试环境:
  Intel Xeon CPU E5649@2.53GHz x24
  VS2010
  python 2.7.6 (32bit)
  thrust v1.5
  numpy 1.8.1

  C++ array
  创建全0向量:0.000s,几乎不占用时间



// Allocate a zero-initialised buffer of 1e8 floats with calloc.
// NOTE(review): the ~0.000s figure quoted above is presumably because the OS
// hands out zero pages lazily and nothing touches the memory here — confirm.
int vector_size=100000000;
float* vector=(float*)calloc(vector_size,sizeof(float));
  创建+填充向量:0.140s



// Allocate 1e8 floats and set every element to 0.01.
int vector_size=100000000;
float* vector=(float*)calloc(vector_size,sizeof(float));
for (int i=0;i<vector_size;++i){
    // The "[i]" subscript was lost from the published listing; without it the
    // statement assigns a double to the pointer itself and does not compile.
    vector[i]=0.01f;
}
  向量点乘:0.390s



// Dot product of vector1 and vector2 (vector_size elements each), accumulated in a float.
float sum=0;
for(int i=0;i<vector_size;++i){
    // "[i]" restored on both operands: the listing had lost the subscripts,
    // which left an invalid pointer * pointer expression.
    sum+=vector1[i]*vector2[i];
}
  向量相乘:0.265s



// Element-wise product: vector3[i] = vector1[i] * vector2[i] for every i.
// The stray `float sum=0;` copied from the dot-product snippet was never read
// and has been dropped.
for(int i=0;i<vector_size;++i){
    // "[i]" restored on all three arrays: the listing had lost the subscripts.
    vector3[i]=vector1[i]*vector2[i];
}

  矩阵乘向量:0.344s



// Matrix (2000 x 50000, row-major in vector1) times vector (50000, in vector2),
// result written to vector3 (2000 elements).
int matrix1_colnum=50000;
int matrix1_rownum=2000;
int matrix1_size=matrix1_colnum*matrix1_rownum;
float* vector1=(float*)calloc(matrix1_size,sizeof(float));
for (int i=0;i<matrix1_size;++i){
    // "[i]" restored: the listing had lost the subscript.
    vector1[i]=0.01f;
}
float* vector2=(float*)calloc(matrix1_colnum,sizeof(float));
for (int i=0;i<matrix1_colnum;++i){
    // "[i]" restored: the listing had lost the subscript.
    vector2[i]=0.02f;
}
start_t=clock();
// The result buffer is allocated inside the timed region, as in the original measurement.
float* vector3=(float*)calloc(matrix1_rownum,sizeof(float));
for(int row=0;row<matrix1_rownum;++row){
    for(int col=0;col<matrix1_colnum;++col){
        vector3[row]+=vector1[row*matrix1_colnum+col]*vector2[col];
    }
}
end_t=clock();

  矩阵乘矩阵:0.749s
  (耗费时间与matrix1_rownum*matrix1_colnum*matrix2_colnum成正比)



// Matrix (200 x 5000) times matrix (5000 x 200), both stored row-major in flat
// float arrays; the 200 x 200 product is accumulated into vector3.
int matrix1_rownum=200;
int matrix1_colnum=5000;
int matrix1_size=matrix1_colnum*matrix1_rownum;
float* vector1=(float*)calloc(matrix1_size,sizeof(float));
for (int i=0;i<matrix1_size;++i){
    // "[i]" restored: the listing had lost the subscript.
    vector1[i]=0.01f;
}
int matrix2_rownum=5000;
int matrix2_colnum=200;
int matrix2_size=matrix2_rownum*matrix2_colnum;
float* vector2=(float*)calloc(matrix2_size,sizeof(float));
for (int i=0;i<matrix2_size;++i){
    // "[i]" restored: the listing had lost the subscript.
    vector2[i]=0.02f;
}
int matrix3_size=matrix1_rownum*matrix2_colnum;
// calloc zeroes the accumulator, so the += below starts from 0.
float* vector3=(float*)calloc(matrix3_size,sizeof(float));
start_t=clock();
for(int row1=0;row1<matrix1_rownum;++row1){
    for(int col2=0;col2<matrix2_colnum;++col2){
        for(int col1=0;col1<matrix1_colnum;++col1){
            vector3[row1*matrix2_colnum+col2]+=vector1[row1*matrix1_colnum+col1]*vector2[col1*matrix2_colnum+col2];
        }
    }
}
end_t=clock();

  C++ STL vector
  创建全0向量:0.140s



// Construct a std::vector of 1e8 value-initialised (0.0f) floats.
int vect_size=100000000;
vector<float> vector(vect_size);

  创建+填充向量:0.140s



// Construct a std::vector of 1e8 floats, each initialised to 0.01
// (the double literal is converted to float).
int vect_size=100000000;
vector<float> vector(vect_size,0.01);

  向量点乘:0.375s



// Dot product of two std::vector<float> of 1e8 elements; only the loop is timed.
int vect_size=100000000;
vector<float> vector1(vect_size,0.01f);
vector<float> vector2(vect_size,0.02f);
start_t=clock();
float sum=0;
for(int i=0;i<vect_size;++i){
    // "[i]" restored: the listing had lost the subscripts, and std::vector
    // has no operator* so the line could not compile as published.
    sum+=vector1[i]*vector2[i];
}
end_t=clock();

  向量相乘:0.250s



// Element-wise product of two std::vector<float> of 1e8 elements into a third;
// only the loop is timed.
int vect_size=100000000;
vector<float> vector1(vect_size,0.01f);
vector<float> vector2(vect_size,0.02f);
vector<float> vector3(vect_size);
start_t=clock();
for(int i=0;i<vect_size;++i){
    // "[i]" restored on all three vectors: the listing had lost the subscripts.
    vector3[i]=vector1[i]*vector2[i];
}
end_t=clock();

  矩阵乘向量:0.390s



// Matrix (2000 x 50000, row-major in vector1) times vector (50000 elements in
// vector2) using std::vector storage; vector3 receives the 2000 results.
int matrix1_colnum=50000;
int matrix1_rownum=2000;
int matrix1_size=matrix1_colnum*matrix1_rownum;
vector<float> vector1(matrix1_size,0.01);
vector<float> vector2(matrix1_colnum,0.02);
// Value-initialised to 0, so it can be used directly as the accumulator.
vector<float> vector3(matrix1_rownum);
start_t=clock();
for(int row=0;row<matrix1_rownum;++row){
for(int col=0;col<matrix1_colnum;++col){
vector3[row]+=vector1[row*matrix1_colnum+col]*vector2[col];
}
}
end_t=clock();

  矩阵乘法:0.827s



// Matrix (200 x 5000) times matrix (5000 x 200) with row-major std::vector
// storage; the 200 x 200 product is accumulated into vector3.
int matrix1_rownum=200;
int matrix1_colnum=5000;
int matrix1_size=matrix1_colnum*matrix1_rownum;
vector<float> vector1(matrix1_size,0.01);
int matrix2_rownum=5000;
int matrix2_colnum=200;
int matrix2_size=matrix2_rownum*matrix2_colnum;
vector<float> vector2(matrix2_size,0.02);
int matrix3_size=matrix1_rownum*matrix2_colnum;
// Value-initialised to 0, so it can be used directly as the accumulator.
vector<float> vector3(matrix3_size);
start_t=clock();
for(int row1=0;row1<matrix1_rownum;++row1){
for(int col2=0;col2<matrix2_colnum;++col2){
// Innermost loop walks the shared dimension (columns of matrix1 / rows of matrix2).
for(int col1=0;col1<matrix1_colnum;++col1){
vector3[row1*matrix2_colnum+col2]+=vector1[row1*matrix1_colnum+col1]*vector2[col1*matrix2_colnum+col2];
}
}
}
end_t=clock();

  C++ thrust(CPU)
  创建全0向量:0.140s



// Construct a thrust::host_vector of 1e8 floats with no explicit fill value.
int vect_size=100000000;
thrust::host_vector<float> vector1(vect_size);

  创建+填充向量:0.140s



// Construct a thrust::host_vector of 1e8 floats, each initialised to 0.01.
int vect_size=100000000;
thrust::host_vector<float> vector1(vect_size,0.01);

  填充向量:0.078s



// Overwrite every element of an already-constructed vector1 with 0.01.
thrust::fill(vector1.begin(),vector1.end(),0.01);

  向量点乘:0.359s



// Dot product on host vectors: element-wise multiply into vector3, then sum vector3.
int vect_size=100000000;
thrust::host_vector<float> vector1(vect_size,(float)0.1);
thrust::host_vector<float> vector2(vect_size,(float)0.2);
thrust::host_vector<float> vector3(vect_size,(float)0.2);
start_t=clock();
thrust::transform(vector1.begin(),vector1.end(),vector2.begin(),vector3.begin(),thrust::multiplies<float>());
// The reduction must ADD the products. The published code reduced with
// thrust::multiplies and an initial value of 0, which always yields 0
// instead of the dot product.
float sum=thrust::reduce(vector3.begin(),vector3.end(),(float)0,thrust::plus<float>());
end_t=clock();

  向量相乘:0.187s



// Element-wise product of two host vectors of 1e8 floats into vector3;
// only the thrust::transform call is timed.
int vect_size=100000000;
thrust::host_vector<float> vector1(vect_size,(float)0.1);
thrust::host_vector<float> vector2(vect_size,(float)0.2);
thrust::host_vector<float> vector3(vect_size);
start_t=clock();
thrust::transform(vector1.begin(),vector1.end(),vector2.begin(),vector3.begin(),thrust::multiplies<float>());
end_t=clock();

  矩阵乘向量:0.110s



// Functor computing one output element of a matrix-vector product.
// Called with a row index, it returns the dot product of that row of
// *matrix (row-major, matrix_colnum columns) with *vector.
struct matrixXvect_func
{
// Non-owning pointers to the operands; the caller sets them before use.
thrust::host_vector<float>* matrix;
thrust::host_vector<float>* vector;
// Stored but not read by operator().
int matrix_rownum;
int matrix_colnum;
// NOTE(review): marked __host__ __device__, but the body indexes
// thrust::host_vector objects — presumably only usable on the host path; confirm.
__host__ __device__
float operator()(const int& idx) const{
float t=0;
for(int col=0;col<matrix_colnum;++col){
t+=(*matrix)[idx*matrix_colnum+col]* (*vector)[col];
}
return t;
}
};
// Driver for the matrix-vector functor: 2000 x 50000 matrix (vector1) times a
// 50000-element vector (vector2), one result per row written to vector3.
int matrix1_rownum=2000;
int matrix1_colnum=50000;
int matrix1_size=matrix1_colnum*matrix1_rownum;
thrust::host_vector<float> vector1(matrix1_size,(float)0.1);
thrust::host_vector<float> vector2(matrix1_colnum,(float)0.2);
thrust::host_vector<float> vector3(matrix1_rownum);
start_t=clock();
// Functor setup is included in the timed region.
matrixXvect_func fn;
fn.matrix=&vector1;
fn.vector=&vector2;
fn.matrix_rownum=matrix1_rownum;
fn.matrix_colnum=matrix1_colnum;
// Apply fn to the row indices 0 .. matrix1_rownum-1.
thrust::transform(
thrust::counting_iterator<int>(0),
thrust::counting_iterator<int>(0) + matrix1_rownum,
vector3.begin(),
fn
);
end_t=clock();

  矩阵乘矩阵:0.655s



// Functor computing one output element of a matrix-matrix product.
// Called with a flat index into the row-major result, it returns the dot
// product of the matching row of *matrix1 and column of *matrix2.
struct matrixXmatrix_func
{
// Non-owning pointers to the operands; the caller sets them before use.
thrust::host_vector<float>* matrix1;
thrust::host_vector<float>* matrix2;
// matrix1_rownum and matrix2_rownum are stored but not read by operator().
int matrix1_rownum;
int matrix1_colnum;
int matrix2_rownum;
int matrix2_colnum;
// NOTE(review): marked __host__ __device__, but the body indexes
// thrust::host_vector objects — presumably only usable on the host path; confirm.
__host__ __device__
float operator()(const int& idx) const{
// Split the flat result index into (row, column) of the result matrix.
int rownum=idx/matrix2_colnum;
int colnum=idx%matrix2_colnum;
float t=0;
for(int col=0;col<matrix1_colnum;++col){
t+=(*matrix1)[rownum*matrix1_colnum+col]* (*matrix2)[col*matrix2_colnum+colnum];
}
return t;
}
};
// Driver for the matrix-matrix functor: (200 x 5000) times (5000 x 200),
// producing a 200 x 200 result in vector3.
int matrix1_rownum=200;
int matrix1_colnum=5000;
int matrix1_size=matrix1_colnum*matrix1_rownum;
thrust::host_vector<float> vector1(matrix1_size,(float)0.1);
int matrix2_rownum=5000;
int matrix2_colnum=200;
int matrix2_size=matrix2_rownum*matrix2_colnum;
thrust::host_vector<float> vector2(matrix2_size,(float)0.2);
int matrix3_size=matrix1_rownum*matrix2_colnum;
thrust::host_vector<float> vector3(matrix3_size);
start_t=clock();
// Functor setup is included in the timed region.
matrixXmatrix_func fn;
fn.matrix1=&vector1;
fn.matrix2=&vector2;
fn.matrix1_rownum=matrix1_rownum;
fn.matrix1_colnum=matrix1_colnum;
fn.matrix2_rownum=matrix2_rownum;
fn.matrix2_colnum=matrix2_colnum;
// Apply fn to every flat output index 0 .. matrix3_size-1.
thrust::transform(
thrust::counting_iterator<int>(0),
thrust::counting_iterator<int>(0) + matrix3_size,
vector3.begin(),
fn
);
end_t=clock();

  C++ thrust(GPU)
  创建全0向量:0.140s
  



// Construct a thrust::device_vector of 1e6 floats with no explicit fill value.
// Note the size here is 1e6, not the 1e8 used by the host-side tests.
int vect_size=1000000;
thrust::device_vector<float> vector1(vect_size);

  
  创建+填充向量:0.140s
  
  



// Construct a thrust::device_vector of 1e6 floats, each initialised to 0.1.
// Note the size here is 1e6, not the 1e8 used by the host-side tests.
int vect_size=1000000;
thrust::device_vector<float> vector1(vect_size,0.1);

  
  CPU向量赋值:0.141s



// Time the construction of a device_vector from a host_vector of 1e6 floats
// (allocation plus host-to-device copy).
int vect_size=1000000;
thrust::host_vector<float> vector1(vect_size,0.1);
start_t=clock();
thrust::device_vector<float> vector2=vector1;
end_t=clock();

  填充向量:0.000s



// Time thrust::fill on an existing device_vector of 1e6 floats.
// NOTE(review): clock() measures host time only; the device work may not have
// finished when end_t is taken unless Thrust synchronizes here — confirm
// before trusting the 0.000s figure.
int vect_size=1000000;
thrust::device_vector<float> vector(vect_size);
start_t=clock();
thrust::fill(vector.begin(),vector.end(),(float)0.1);
end_t=clock();

  向量点乘:0.016s



// Dot product on device vectors: element-wise multiply into vector3, then sum vector3.
int vect_size=100000000;
thrust::device_vector<float> vector1(vect_size,(float)0.1);
thrust::device_vector<float> vector2(vect_size,(float)0.2);
thrust::device_vector<float> vector3(vect_size,(float)0.2);
start_t=clock();
thrust::transform(vector1.begin(),vector1.end(),vector2.begin(),vector3.begin(),thrust::multiplies<float>());
// The reduction must ADD the products. The published code reduced with
// thrust::multiplies and an initial value of 0, which always yields 0
// instead of the dot product.
float sum=thrust::reduce(vector3.begin(),vector3.end(),(float)0,thrust::plus<float>());
end_t=clock();

  向量相乘:0.000s



// Element-wise product of two device vectors of 1e8 floats into vector3;
// only the thrust::transform call is timed.
// NOTE(review): clock() measures host time only and no result is read back,
// so the device work may still be running when end_t is taken — the 0.000s
// figure is presumably launch overhead rather than compute time; confirm.
int vect_size=100000000;
thrust::device_vector<float> vector1(vect_size,(float)0.1);
thrust::device_vector<float> vector2(vect_size,(float)0.2);
thrust::device_vector<float> vector3(vect_size);
start_t=clock();
thrust::transform(vector1.begin(),vector1.end(),vector2.begin(),vector3.begin(),thrust::multiplies<float>());
end_t=clock();

  矩阵乘向量(实现1):0.530s



// Matrix (2000 x 50000, row-major in vector1) times vector (vector2) on the
// device, one row at a time: multiply the row by vector2 into tmp, then sum tmp.
int matrix1_rownum=2000;
int matrix1_colnum=50000;
int matrix1_size=matrix1_colnum*matrix1_rownum;
thrust::device_vector<float> vector1(matrix1_size,(float)0.1);
thrust::device_vector<float> vector2(matrix1_colnum,(float)0.2);
thrust::device_vector<float> tmp(matrix1_colnum);
thrust::device_vector<float> vector3(matrix1_rownum);
start_t=clock();
for(int row=0;row<matrix1_rownum;++row){
    thrust::transform(vector1.begin()+row*matrix1_colnum,vector1.begin()+(row+1)*matrix1_colnum,vector2.begin(),tmp.begin(),thrust::multiplies<float>());
    // The reduction must ADD the per-element products. The published code
    // reduced with thrust::multiplies and an initial value of 0, so every
    // row result was 0.
    vector3[row]=thrust::reduce(tmp.begin(),tmp.end(),(float)0,thrust::plus<float>());
}
end_t=clock();

  矩阵乘向量(实现2)CUBLAS,待试
  矩阵乘矩阵CUBLAS,待试
  
  Python
  直接使用python的list实现上述功能实在太慢……而且由于无法指定float类型,其默认使用64位(8字节)的double类型来表示小数,使用10^8会超出list索引上限……故只使用10^7实验,速度差距可以自行换算。
  大致估算python的向量运算比c++慢50倍,矩阵运算慢1000倍。
  初始化向量并赋值:1.51s



# Build a list of 1e7 floats by appending one element at a time.
vector_size=10000000
vector=[]
for i in range(vector_size):
    # Loop body re-indented: the published listing had lost all indentation,
    # which is a syntax error in Python.
    vector.append(0.1)

  向量点乘:1.75s



# Dot product of two 1e7-element Python lists; only the accumulation loop is timed.
# Indentation and the "[i]" subscripts were lost from the published listing
# and are restored here.
vector_size=10000000
vector1=[]
for i in range(vector_size):
    vector1.append(0.1)
vector2=[]
for i in range(vector_size):
    vector2.append(0.1)
start_t=time.time()
sum=0
for i in range(vector_size):
    # Without the subscripts this multiplied two lists, which raises TypeError.
    sum+=vector1[i]*vector2[i]
end_t=time.time()

  向量相乘:2.39s



# Element-wise product of two 1e7-element Python lists into a pre-sized third
# list; only the multiply loop is timed.
# Indentation and the "[i]" subscripts were lost from the published listing
# and are restored here.
vector_size=10000000
vector1=[]
for i in range(vector_size):
    vector1.append(0.1)
vector2=[]
for i in range(vector_size):
    vector2.append(0.1)
# Pre-fill the output so it can be assigned by index below.
vector3=[]
for i in range(vector_size):
    vector3.append(0.1)
start_t=time.time()
for i in range(vector_size):
    # Without the subscripts this multiplied two lists, which raises TypeError.
    vector3[i]=vector1[i]*vector2[i]
end_t=time.time()

  矩阵乘向量:3.06s



# Matrix (2000 x 5000, row-major in vector1) times vector (vector2) using
# Python lists; only the nested loop is timed.
# Indentation was lost from the published listing and is restored here.
matrix1_rownum=2000
matrix1_colnum=5000
matrix1_size=matrix1_rownum*matrix1_colnum
vector1=[]
for i in range(matrix1_size):
    vector1.append(0.1)
vector2=[]
for i in range(matrix1_colnum):
    vector2.append(0.1)
# The accumulator must start at 0; the published code started it at 0.1.
vector3=[]
for i in range(matrix1_rownum):
    vector3.append(0.0)
start_t=time.time()
for row in range(matrix1_rownum):
    for col in range(matrix1_colnum):
        # Accumulate with +=. The published code used plain assignment, so each
        # row ended up holding only its last product instead of the row sum.
        vector3[row]+=vector1[row*matrix1_colnum+col]*vector2[col]
end_t=time.time()

  矩阵相乘:11.37s



# Matrix (200 x 500) times matrix (500 x 200) using row-major Python lists;
# only the triple loop is timed.
# Indentation was lost from the published listing and is restored here.
matrix1_rownum=200
matrix1_colnum=500
matrix1_size=matrix1_rownum*matrix1_colnum
vector1=[]
for i in range(matrix1_size):
    vector1.append(0.1)
matrix2_rownum=500
matrix2_colnum=200
matrix2_size=matrix2_rownum*matrix2_colnum
vector2=[]
for i in range(matrix2_size):
    vector2.append(0.1)
matrix3_size=matrix1_rownum*matrix2_colnum
# The accumulator must start at 0; the published code started it at 0.1,
# which offsets every entry of the product by 0.1.
vector3=[]
for i in range(matrix3_size):
    vector3.append(0.0)
start_t=time.time()
for row in range(matrix1_rownum):
    for col in range(matrix2_colnum):
        for i in range(matrix1_colnum):
            vector3[row*matrix2_colnum+col]+=vector1[row*matrix1_colnum+i]*vector2[i*matrix2_colnum+col]
end_t=time.time()

  当然实际进行向量运算没人会拿python的list数据结构进行运算,这里只是好奇定量测一下list到底有多慢……
  Python numpy
  创建全0向量:0.0s



# Create a numpy array of 1e8 zeros. numpy.zeros defaults to float64,
# unlike the 32-bit floats used in the C++ tests.
vector_size=100000000
vector=numpy.zeros(vector_size)

  创建+填充向量:0.25s



# Create a numpy array of 1e8 zeros (float64 by default), then overwrite
# every element with 0.01 in place.
vector_size=100000000
vector=numpy.zeros(vector_size)
vector.fill(0.01)

  向量点乘:0.125s(由于python是32位……内存原因,数据规模减半)



# Dot product of two numpy arrays of 5e7 elements (half the C++ size);
# only the numpy.inner call is timed.
vector_size=50000000
vector1=numpy.zeros(vector_size)
vector1.fill(0.01)
vector2=numpy.zeros(vector_size)
vector2.fill(0.02)
start_t=time.time()
sum=numpy.inner(vector1,vector2)
end_t=time.time()

  向量相乘:0.234s



# Element-wise product of two numpy arrays of 5e7 elements; the timed call
# also allocates the result array returned by numpy.multiply.
vector_size=50000000
vector1=numpy.zeros(vector_size)
vector1.fill(0.01)
vector2=numpy.zeros(vector_size)
vector2.fill(0.02)
start_t=time.time()
vector3=numpy.multiply(vector1,vector2)
end_t=time.time()

  矩阵乘向量:0.094s



# Matrix (2000 x 50000) times column vector (50000 x 1) with numpy.dot.
# The reshape calls are inside the timed region.
matrix1_rownum=2000
matrix1_colnum=50000
matrix1_size=matrix1_rownum*matrix1_colnum
vector1=numpy.zeros(matrix1_size)
vector1.fill(0.01)
vector2=numpy.zeros(matrix1_colnum)
vector2.fill(0.02)
start_t=time.time()
# Reinterpret the flat buffers as a 2-D matrix and a column vector.
vector1=vector1.reshape(matrix1_rownum,matrix1_colnum)
vector2=vector2.reshape(matrix1_colnum,1)
vector3=numpy.dot(vector1,vector2)
end_t=time.time()

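  矩阵乘矩阵:23.16s(numpy.dot出乎意料的慢,使用numpy.matrix类时间为11.73s,依旧很慢而且占用更大内存,在创建matrix对象时也要0.4s。注意:此处的矩阵规模为2000×50000乘以50000×1000,乘加次数约为10^11,是前面C++测试(200×5000乘以5000×200,约2×10^8次)的500倍,两者耗时不能直接比较)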



# Matrix (2000 x 50000) times matrix (50000 x 1000) with numpy.dot.
# This is 2000*50000*1000 = 1e11 multiply-adds, 500x the 200*5000*200 = 2e8
# of the C++ matrix-matrix snippets, so the timings are not directly comparable.
# The reshape calls are inside the timed region.
matrix1_rownum=2000
matrix1_colnum=50000
matrix1_size=matrix1_rownum*matrix1_colnum
vector1=numpy.zeros(matrix1_size)
vector1.fill(0.01)
matrix2_rownum=50000
matrix2_colnum=1000
matrix2_size=matrix2_rownum*matrix2_colnum
vector2=numpy.zeros(matrix2_size)
vector2.fill(0.02)
start_t=time.time()
# Reinterpret the flat buffers as 2-D matrices before multiplying.
vector1=vector1.reshape(matrix1_rownum,matrix1_colnum)
vector2=vector2.reshape(matrix2_rownum,matrix2_colnum)
vector3=numpy.dot(vector1,vector2)
end_t=time.time()

  

运维网声明 1、欢迎大家加入本站运维交流群:群②:261659950 群⑤:202807635 群⑦870801961 群⑧679858003
2、本站所有主题由该帖子作者发表,该帖子作者与运维网享有帖子相关版权
3、所有作品的著作权均归原作者享有,请您和我们一样尊重他人的著作权等合法权益。如果您对作品感到满意,请购买正版
4、禁止制作、复制、发布和传播具有反动、淫秽、色情、暴力、凶杀等内容的信息,一经发现立即删除。若您因此触犯法律,一切后果自负,我们对此不承担任何责任
5、所有资源均系网友上传或者通过网络收集,我们仅提供一个展示、介绍、观摩学习的平台,我们不对其内容的准确性、可靠性、正当性、安全性、合法性等负责,亦不承担任何法律责任
6、所有作品仅供您个人学习、研究或欣赏,不得用于商业或者其他用途,否则,一切后果均由您自己承担,我们对此不承担任何法律责任
7、如涉及侵犯版权等问题,请您及时通知我们,我们将立即采取措施予以解决
8、联系人Email:admin@iyunv.com 网址:www.yunweiku.com

所有资源均系网友上传或者通过网络收集,我们仅提供一个展示、介绍、观摩学习的平台,我们不对其承担任何法律责任,如涉及侵犯版权等问题,请您及时通知我们,我们将立即处理,联系人Email:kefu@iyunv.com,QQ:1061981298 本贴地址:https://www.yunweiku.com/thread-145033-1-1.html 上篇帖子: Python实现类似switch...case功能 下篇帖子: python xml.dom模块解析xml
您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

扫码加入运维网微信交流群X

扫码加入运维网微信交流群

扫描二维码加入运维网微信交流群,最新一手资源尽在官方微信交流群!快快加入我们吧...

扫描微信二维码查看详情

客服E-mail:kefu@iyunv.com 客服QQ:1061981298


QQ群⑦:运维网交流群⑦ QQ群⑧:运维网交流群⑧ k8s群:运维网kubernetes交流群


提醒:禁止发布任何违反国家法律、法规的言论与图片等内容;本站内容均来自个人观点与网络等信息,非本站认同之观点.


本站大部分资源是网友从网上搜集分享而来,其版权均归原作者及其网站所有,我们尊重他人的合法权益,如有内容侵犯您的合法权益,请及时与我们联系进行核实删除!



合作伙伴: 青云cloud

快速回复 返回顶部 返回列表