deci 发表于 2015-8-2 12:18:33

Apache mahout 源码阅读笔记--协同过滤, PearsonCorrelationSimilarity

  协同过滤源码路径:
~/project/javaproject/mahout-0.9/core/src $tree main/java/org/apache/mahout/cf/taste/ -L 2
main/java/org/apache/mahout/cf/taste/
├── common
│   ├── NoSuchItemException.java
│   ├── NoSuchUserException.java
│   ├── Refreshable.java
│   ├── TasteException.java
│   └── Weighting.java
├── eval
│   ├── DataModelBuilder.java
│   ├── IRStatistics.java
│   ├── RecommenderBuilder.java
│   ├── RecommenderEvaluator.java
│   ├── RecommenderIRStatsEvaluator.java
│   └── RelevantItemsDataSplitter.java
├── hadoop
│   ├── EntityEntityWritable.java
│   ├── EntityPrefWritable.java
│   ├── MutableRecommendedItem.java
│   ├── RecommendedItemsWritable.java
│   ├── TasteHadoopUtils.java
│   ├── ToEntityPrefsMapper.java
│   ├── ToItemPrefsMapper.java
│   ├── TopItemsQueue.java
│   ├── als
│   ├── item
│   ├── preparation
│   └── similarity
├── impl
│   ├── common
│   ├── eval
│   ├── model
│   ├── neighborhood
│   ├── recommender
│   └── similarity
├── model
│   ├── DataModel.java
│   ├── IDMigrator.java
│   ├── JDBCDataModel.java
│   ├── Preference.java
│   ├── PreferenceArray.java
│   └── UpdatableIDMigrator.java
├── neighborhood
│   └── UserNeighborhood.java
├── recommender
│   ├── CandidateItemsStrategy.java
│   ├── IDRescorer.java
│   ├── ItemBasedRecommender.java
│   ├── MostSimilarItemsCandidateItemsStrategy.java
│   ├── RecommendedItem.java
│   ├── Recommender.java
│   ├── Rescorer.java
│   └── UserBasedRecommender.java
└── similarity
    ├── ItemSimilarity.java
    ├── PreferenceInferrer.java
    ├── UserSimilarity.java
    └── precompute

similarity:相似度的 interface 定义
recommender:推荐算法的 interface 定义
model:数据 model 类型的 interface 定义

impl 目录 则是以上interface定义的实现

PearsonCorrelationSimilarity的实现在
  ~/mahout-core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/PearsonCorrelationSimilarity.java



/**
* @throws IllegalArgumentException if {@link DataModel} does not have preference values
*/
public PearsonCorrelationSimilarity(DataModel dataModel, Weighting weighting) throws TasteException {
//这里CenterData传的时true
/* pearson其实做的事情就是先把两个向量都减去他们的平均值,然后再计算cosine值。
* 在 AbstractSimilarity里的实现代码如下:
*double result;
if (centerData) {
double meanX = sumX / count;
double meanY = sumY / count;
// double centeredSumXY = sumXY - meanY * sumX - meanX * sumY + n * meanX * meanY;
double centeredSumXY = sumXY - meanY * sumX;
// double centeredSumX2 = sumX2 - 2.0 * meanX * sumX + n * meanX * meanX;
double centeredSumX2 = sumX2 - meanX * sumX;
// double centeredSumY2 = sumY2 - 2.0 * meanY * sumY + n * meanY * meanY;
double centeredSumY2 = sumY2 - meanY * sumY;
result = computeResult(count, centeredSumXY, centeredSumX2, centeredSumY2, sumXYdiff2);
} else {
result = computeResult(count, sumXY, sumX2, sumY2, sumXYdiff2);
}
*/
super(dataModel, weighting, true);
Preconditions.checkArgument(dataModel.hasPreferenceValues(), "DataModel doesn't have preference values");
}
/**
 * Computes the correlation from already-accumulated sums.
 *
 * <p>The sums are expected to be centered (means subtracted) by the caller, which
 * is why the plain sums of X and Y are not parameters here.
 *
 * @param n number of value pairs that were accumulated
 * @param sumXY sum of the products x * y
 * @param sumX2 sum of the squares of x
 * @param sumY2 sum of the squares of y
 * @param sumXYdiff2 sum of squared differences (x - y); not used by this measure
 * @return {@code sumXY / (sqrt(sumX2) * sqrt(sumY2))}, or {@link Double#NaN} when
 *     {@code n} is 0 or the denominator is exactly 0
 */
@Override
double computeResult(int n, double sumXY, double sumX2, double sumY2, double sumXYdiff2) {
  // Nothing was accumulated, so no similarity can be stated.
  if (n == 0) {
    return Double.NaN;
  }
  double rootX = Math.sqrt(sumX2);
  double rootY = Math.sqrt(sumY2);
  double norm = rootX * rootY;
  // A zero norm means one or both parties gave the same rating to everything;
  // this measure cannot say anything useful in that case.
  return norm == 0.0 ? Double.NaN : sumXY / norm;
}
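  就是数学公式(皮尔逊相关系数)的实现:

  $$ r = \frac{\sum_{i=1}^{n} (x_i - \bar{x})(y_i - \bar{y})}{\sqrt{\sum_{i=1}^{n} (x_i - \bar{x})^2}\,\sqrt{\sum_{i=1}^{n} (y_i - \bar{y})^2}} $$

  其中分子对应中心化后的 sumXY,分母中的两个平方和分别对应中心化后的 sumX2 和 sumY2。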

  
  具体的累加(sumX、sumY、sumXY、sumX2、sumY2 等),在 AbstractSimilarity 的 userSimilarity 方法里已经做了:



@Override
public double userSimilarity(long userID1, long userID2) throws TasteException {
DataModel dataModel = getDataModel();
  //获取用户偏好
PreferenceArray xPrefs = dataModel.getPreferencesFromUser(userID1);
PreferenceArray yPrefs = dataModel.getPreferencesFromUser(userID2);
int xLength = xPrefs.length();
int yLength = yPrefs.length();
if (xLength == 0 || yLength == 0) {
return Double.NaN;
}
long xIndex = xPrefs.getItemID(0);
long yIndex = yPrefs.getItemID(0);
int xPrefIndex = 0;
int yPrefIndex = 0;
double sumX = 0.0;
double sumX2 = 0.0;
double sumY = 0.0;
double sumY2 = 0.0;
double sumXY = 0.0;
double sumXYdiff2 = 0.0;
int count = 0;
boolean hasInferrer = inferrer != null;
while (true) {
int compare = xIndex < yIndex ? -1 : xIndex > yIndex ? 1 : 0;
if (hasInferrer || compare == 0) {
double x;
double y;
if (xIndex == yIndex) {
// Both users expressed a preference for the item
x = xPrefs.getValue(xPrefIndex);
y = yPrefs.getValue(yPrefIndex);
} else {
//如果不存在对应的分数,则进行推断...
// Only one user expressed a preference, but infer the other one's preference and tally
// as if the other user expressed that preference
if (compare < 0) {
// X has a value; infer Y's
x = xPrefs.getValue(xPrefIndex);
y = inferrer.inferPreference(userID2, xIndex);
} else {
// compare > 0
// Y has a value; infer X's
x = inferrer.inferPreference(userID1, yIndex);
y = yPrefs.getValue(yPrefIndex);
}
}
sumXY += x * y;
sumX += x;
sumX2 += x * x;
sumY += y;
sumY2 += y * y;
double diff = x - y;
sumXYdiff2 += diff * diff;
count++;
}
if (compare = xLength) {
if (hasInferrer) {
// Must count other Ys; pretend next X is far away
if (yIndex == Long.MAX_VALUE) {
// ... but stop if both are done!
break;
}
xIndex = Long.MAX_VALUE;
} else {
break;
}
} else {
xIndex = xPrefs.getItemID(xPrefIndex);
}
}
if (compare >= 0) {
if (++yPrefIndex >= yLength) {
if (hasInferrer) {
// Must count other Xs; pretend next Y is far away
if (xIndex == Long.MAX_VALUE) {
// ... but stop if both are done!
break;
}
yIndex = Long.MAX_VALUE;
} else {
break;
}
} else {
yIndex = yPrefs.getItemID(yPrefIndex);
}
}
}
// "Center" the data. If my math is correct, this'll do it.
double result;
if (centerData) {
double meanX = sumX / count;
double meanY = sumY / count;
// double centeredSumXY = sumXY - meanY * sumX - meanX * sumY + n * meanX * meanY;
double centeredSumXY = sumXY - meanY * sumX;
// double centeredSumX2 = sumX2 - 2.0 * meanX * sumX + n * meanX * meanX;
double centeredSumX2 = sumX2 - meanX * sumX;
// double centeredSumY2 = sumY2 - 2.0 * meanY * sumY + n * meanY * meanY;
double centeredSumY2 = sumY2 - meanY * sumY;
result = computeResult(count, centeredSumXY, centeredSumX2, centeredSumY2, sumXYdiff2);
} else {
result = computeResult(count, sumXY, sumX2, sumY2, sumXYdiff2);
}
if (!Double.isNaN(result)) {
result = normalizeWeightResult(result, count, cachedNumItems);
}
return result;
}
  
  参考:
  http://blog.iyunv.com/v_july_v/article/details/7184318
  http://blog.sina.com.cn/s/blog_73de143c010153vp.html
  
页: [1]
查看完整版本: Apache mahout 源码阅读笔记--协同过滤, PearsonCorrelationSimilarity