[Original] Spark MLlib Series (Part 2): A Movie Recommendation System Based on Collaborative Filtering
With the arrival of the big data era, mining gold out of data has become an increasingly attractive line of work. Spark's strength in in-memory iterative computation makes it a natural fit for machine learning, so applying it to data-mining problems has real practical value. This article shares a hands-on Spark MLlib recommendation example: building a movie-rating recommender with Spark MLlib.
The standard approach to matrix-factorization-based collaborative filtering treats the entries of the user-item matrix as explicit preferences given by users for items.
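Concretely, ALS approximates the sparse m × n user-item rating matrix R as the product of two low-rank factor matrices by minimizing a regularized squared error. A sketch of the explicit-feedback objective (my notation, for illustration, not from the original post), with rank k and regularization parameter lambda:

\min_{U,V} \sum_{(i,j) \in \Omega} \left( R_{ij} - u_i^{\top} v_j \right)^2 + \lambda \left( \sum_i \lVert u_i \rVert^2 + \sum_j \lVert v_j \rVert^2 \right)

Here u_i, v_j are the k-dimensional latent-factor vectors for user i and item j, and Omega is the set of observed ratings. ALS alternates between solving for all u_i with V fixed and all v_j with U fixed; each step is an ordinary regularized least-squares problem, which is what makes the algorithm easy to parallelize.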
In many real-world scenarios, however, we only have access to implicit feedback (e.g., views, clicks, purchases, likes, shares). The method MLlib uses to handle such data comes from the paper Collaborative Filtering for Implicit Feedback Datasets. In essence, instead of modeling the rating matrix directly, this approach treats the data as a combination of binary preferences and confidence values. The ratings are then related to the observed strength of user preferences rather than to explicit scores given to items. The model then tries to find latent factors that can be used to predict a user's expected preference for an item.
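To make the explicit/implicit distinction concrete in code, here is a minimal sketch of the two MLlib entry points (the file path and the rank/iterations/lambda/alpha values are placeholder choices of mine, not from the original post):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.recommendation.{ALS, Rating}

object ImplicitVsExplicit {
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("ImplicitVsExplicit").setMaster("local[*]"))

    // Hypothetical input: "user \t item \t value \t timestamp" lines, e.g. MovieLens u.data.
    val ratings = sc.textFile("data/mllib/u.data").map(_.split("\t") match {
      case Array(user, item, value, _) => Rating(user.toInt, item.toInt, value.toDouble)
    })

    // Explicit feedback: each value is taken as an actual rating of the item.
    val explicitModel = ALS.train(ratings, 10, 20, 0.01)

    // Implicit feedback: each value is read as the strength of an observation
    // (views, clicks, ...); alpha controls how fast confidence grows with it.
    val implicitModel = ALS.trainImplicit(ratings, 10, 20, 0.01, 1.0)

    sc.stop()
  }
}

Note that with trainImplicit the predictions are preference estimates (roughly in [0, 1]) rather than reconstructed ratings, so they are best read as a ranking signal, not a predicted score.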
The collaborative filtering algorithm currently available in MLlib is ALS (alternating least squares). The full example code follows:
package com.ml.recommender

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import org.apache.spark.mllib.recommendation._
import org.apache.spark.rdd.RDD

import java.util.ArrayList

import scopt.OptionParser

import com.ml.util.HbaseUtil

/**
 * MovieLens movie recommendation
 */
object MoiveRecommender {

  val numRecommender = 10

  case class Params(
    input: String = null,
    numIterations: Int = 20,
    lambda: Double = 1.0,
    rank: Int = 10,
    numUserBlocks: Int = -1,
    numProductBlocks: Int = -1,
    implicitPrefs: Boolean = false,
    userDataInput: String = null)

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("MoiveRecommender") {
      head("MoiveRecommender: an example app for ALS on MovieLens data.")
      opt[Int]("rank")
        .text(s"rank, default: ${defaultParams.rank}")
        .action((x, c) => c.copy(rank = x))
      opt[Int]("numIterations")
        .text(s"number of iterations, default: ${defaultParams.numIterations}")
        .action((x, c) => c.copy(numIterations = x))
      opt[Double]("lambda")
        .text(s"lambda (smoothing constant), default: ${defaultParams.lambda}")
        .action((x, c) => c.copy(lambda = x))
      opt[Int]("numUserBlocks")
        .text(s"number of user blocks, default: ${defaultParams.numUserBlocks} (auto)")
        .action((x, c) => c.copy(numUserBlocks = x))
      opt[Int]("numProductBlocks")
        .text(s"number of product blocks, default: ${defaultParams.numProductBlocks} (auto)")
        .action((x, c) => c.copy(numProductBlocks = x))
      opt[Unit]("implicitPrefs")
        .text("use implicit preference")
        .action((_, c) => c.copy(implicitPrefs = true))
      opt[String]("userDataInput")
        .required()
        .text("user data input path")
        .action((x, c) => c.copy(userDataInput = x))
      arg[String]("<input>")
        .required()
        .text("input paths to a MovieLens dataset of ratings")
        .action((x, c) => c.copy(input = x))
      note(
        """
          |For example, the following command runs this app on a synthetic dataset:
          |
          | bin/spark-submit --class com.ml.recommender.MoiveRecommender \
          |  examples/target/scala-*/spark-examples-*.jar \
          |  --rank 5 --numIterations 20 --lambda 1.0 \
          |  data/mllib/u.data
        """.stripMargin)
    }

    parser.parse(args, defaultParams).map { params =>
      run(params)
    } getOrElse {
      System.exit(1)
    }
  }

  def run(params: Params) {
    // Local mode: point Spark at a local installation.
    var conf = new SparkConf().setAppName("Moive Recommendation")
      .setSparkHome("D:\\work\\hadoop_lib\\spark-1.1.0-bin-hadoop2.4\\spark-1.1.0-bin-hadoop2.4")
    conf.setMaster("local[*]")
    // Cluster mode: pick up the cluster's environment variables instead.
    // var conf = new SparkConf().setAppName("Moive Recommendation")
    val context = new SparkContext(conf)

    // Load the ratings data.
    val data = context.textFile(params.input)

    /**
     * MovieLens ratings are on a scale of 1-5:
     * 5: Must see
     * 4: Will enjoy
     * 3: It's okay
     * 2: Fairly bad
     * 1: Awful
     */
    val ratings = data.map(_.split("\t") match {
      case Array(user, item, rate, time) => Rating(user.toInt, item.toInt, rate.toDouble)
    })

    // Build the recommendation model with ALS.
    // The simple one-line API would also work:
    // val model = ALS.train(ratings, params.rank, params.numIterations)
    val model = new ALS()
      .setRank(params.rank)
      .setIterations(params.numIterations)
      .setLambda(params.lambda)
      .setImplicitPrefs(params.implicitPrefs)
      .setUserBlocks(params.numUserBlocks)
      .setProductBlocks(params.numProductBlocks)
      .run(ratings)

    predictMoive(params, context, model)
    evaluateModel(ratings, model)

    // Clean up.
    context.stop()
  }

  /**
   * Model evaluation
   */
  private def evaluateModel(ratings: RDD[Rating], model: MatrixFactorizationModel) {
    // Rebuild the (user, product) pairs from the training data.
    val usersProducts = ratings.map {
      case Rating(user, product, rate) => (user, product)
    }

    // Predict a rating for every pair.
    val predictions = model.predict(usersProducts).map {
      case Rating(user, product, rate) => ((user, product), rate)
    }

    // Join the actual ratings with the predicted ones.
    val ratesAndPreds = ratings.map {
      case Rating(user, product, rate) => ((user, product), rate)
    }.join(predictions)

    // Compute the mean squared error.
    val MSE = ratesAndPreds.map {
      case ((user, product), (r1, r2)) =>
        val err = r1 - r2
        err * err
    }.mean()

    // Print the mean squared error.
    println("Mean Squared Error = " + MSE)
  }

  /**
   * Generate predictions and save them to HBase.
   */
  private def predictMoive(params: Params, context: SparkContext, model: MatrixFactorizationModel) {
    var recommenders = new ArrayList[java.util.Map[String, String]]()

    // Read the users we need movie recommendations for.
    val userData = context.textFile(params.userDataInput)

    userData.map(_.split("\\|") match {
      case Array(id, age, sex, job, x) => id
    }).collect().foreach { id =>
      // Recommend movies for this user.
      var rs = model.recommendProducts(id.toInt, numRecommender)
      var value = ""
      var key = 0

      // Serialize the recommendations as "product:rating" pairs.
      rs.foreach { r =>
        key = r.user
        value = value + r.product + ":" + r.rating + ","
      }

      // On success, wrap the row up as a put, to be inserted into HBase.
      if (!value.equals("")) {
        var put = new java.util.HashMap[String, String]()
        put.put("rowKey", key.toString)
        put.put("t:info", value)
        recommenders.add(put)
      }
    }

    // Save into the HBase table [recommender].
    // recommenders is a Java ArrayList; you can write your own HBase utility
    // class in Java or Scala -- the original post leaves it as an exercise
    // (a possible sketch follows below).
    HbaseUtil.saveListMap("recommender", recommenders)
  }
}
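The post leaves HbaseUtil to the reader. For reference, here is one possible sketch of what HbaseUtil.saveListMap could look like, assuming the HBase 1.x client API and the rowKey / "family:qualifier" map layout used above; this is my illustration, not the original author's code:

package com.ml.util

import java.util.{List => JList, Map => JMap}

import scala.collection.JavaConverters._

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes

object HbaseUtil {

  /**
   * Writes each map as one HBase row: the "rowKey" entry becomes the row key,
   * and every other entry is interpreted as "family:qualifier" -> value.
   */
  def saveListMap(tableName: String, rows: JList[JMap[String, String]]): Unit = {
    val connection = ConnectionFactory.createConnection(HBaseConfiguration.create())
    val table = connection.getTable(TableName.valueOf(tableName))
    try {
      val puts = rows.asScala.map { row =>
        val put = new Put(Bytes.toBytes(row.get("rowKey")))
        row.asScala.foreach {
          case ("rowKey", _) => // already used as the row key
          case (column, value) =>
            val Array(family, qualifier) = column.split(":", 2)
            put.addColumn(Bytes.toBytes(family), Bytes.toBytes(qualifier), Bytes.toBytes(value))
        }
        put
      }
      table.put(puts.toList.asJava)
    } finally {
      table.close()
      connection.close()
    }
  }
}

With this in place, the call HbaseUtil.saveListMap("recommender", recommenders) from predictMoive writes one row per user into the [recommender] table, with the recommendation string stored under the t:info column.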