sina sina - 1 month ago 6
Scala Question

SparkSQL:Avg based on a column after GroupBy

I have an rdd of student grades and I need to first group them by the first column which is university and then show the average of student numbers in each course like this. What is the easiest way to do this query?

+----------+-------------------+
|university| avg of students |
+----------+--------------------+
| MIT| 3 |
| Cambridge| 2.66


Here is the dataset.

case class grade(university: String, courseId: Int, studentId: Int, grade: Double)

val grades = List(grade(
grade("Cambridge", 1, 1001, 4),
grade("Cambridge", 1, 1004, 4),
grade("Cambridge", 2, 1006, 3.5),
grade("Cambridge", 2, 1004, 3.5),
grade("Cambridge", 2, 1002, 3.5),
grade("Cambridge", 3, 1006, 3.5),
grade("Cambridge", 3, 1007, 5),
grade("Cambridge", 3, 1008, 4.5),
grade("MIT", 1, 1001, 4),
grade("MIT", 1, 1002, 4),
grade("MIT", 1, 1003, 4),
grade("MIT", 1, 1004, 4),
grade("MIT", 1, 1005, 3.5),
grade("MIT", 2, 1009, 2))

Answer

1) First groupBy university

2) then get course count per university

3) then groupBy courseId

4) then get student count per course

grades.groupBy(_.university).map { case (k, v) =>
    val courseCount = v.map(_.courseId).distinct.length
    val studentCountPerCourse = v.groupBy(_.courseId).map { case (k, v) => v.length }.sum
    k -> (studentCountPerCourse.toDouble / courseCount.toDouble)
  }

Scala REPL

scala>   val grades = List(
      grade("Cambridge", 1, 1001, 4),
      grade("Cambridge", 1, 1004, 4),
      grade("Cambridge", 2, 1006, 3.5),
      grade("Cambridge", 2, 1004, 3.5),
      grade("Cambridge", 2, 1002, 3.5),
      grade("Cambridge", 3, 1006, 3.5),
      grade("Cambridge", 3, 1007, 5),
      grade("Cambridge", 3, 1008, 4.5),
      grade("MIT", 1, 1001, 4),
      grade("MIT", 1, 1002, 4),
      grade("MIT", 1, 1003, 4),
      grade("MIT", 1, 1004, 4),
      grade("MIT", 1, 1005, 3.5),
      grade("MIT", 2, 1009, 2))
grades: List[grade] = List(
  grade("Cambridge", 1, 1001, 4.0),
  grade("Cambridge", 1, 1004, 4.0),
  grade("Cambridge", 2, 1006, 3.5),
  grade("Cambridge", 2, 1004, 3.5),
  grade("Cambridge", 2, 1002, 3.5),
  grade("Cambridge", 3, 1006, 3.5),
  grade("Cambridge", 3, 1007, 5.0),
  grade("Cambridge", 3, 1008, 4.5),
  grade("MIT", 1, 1001, 4.0),
  grade("MIT", 1, 1002, 4.0),
  grade("MIT", 1, 1003, 4.0),
  grade("MIT", 1, 1004, 4.0),
  grade("MIT", 1, 1005, 3.5),
  grade("MIT", 2, 1009, 2.0)
)

scala>   grades.groupBy(_.university).map { case (k, v) =>
      val courseCount = v.map(_.courseId).distinct.length
      val studentCountPerCourse = v.groupBy(_.courseId).map { case (k, v) => v.length }.sum
      k -> (studentCountPerCourse.toDouble / courseCount.toDouble)
    }
res2: Map[String, Double] = Map("MIT" -> 3.0, "Cambridge" -> 2.6666666666666665)