J.Done J.Done - 18 days ago 8
Scala Question

Spark - Make dataframe with multi column csv

origin.csv
no,key1,key2,key3,key4,key5,...
1,A1,B1,C1,D1,E1,..
2,A2,B2,C2,D2,E2,..
3,A3,B3,C3,D3,E3,..


WhatIwant.csv
1,A1,key1
1,B1,key2
1,C1,key3
...
3,A3,key1
3,B3,key2
...


I loaded the CSV with the read method (into the origin.csv DataFrame), but I am unable to convert it to the desired shape.

// Load the CSV with a header row, decoding the file as EUC-KR (Korean) text.
// NOTE(review): assumes `spark` is a SparkSession already in scope (e.g.
// spark-shell) and `csvFilePath` points at origin.csv — confirm in context.
val df = spark.read
.option("header", true)
.option("charset", "euc-kr")
.csv(csvFilePath)


Do you have any idea how to do this?

Answer

Try this.

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._

// Sample data mirroring origin.csv: an identifier column "no" plus several
// value columns key1..key4. Requires `spark.implicits._` for `toDF`.
// Fix: row 3's key4 value was "D2" in the original — a copy/paste typo;
// the pattern (A3, B3, C3, ...) makes "D3" the intended value.
val df = Seq(
  (1, "A1", "B1", "C1", "D1"),
  (2, "A2", "B2", "C2", "D2"),
  (3, "A3", "B3", "C3", "D3")
).toDF("no", "key1", "key2", "key3", "key4")
df.show

/** Unpivots (melts) `df`: every column NOT listed in `by` becomes one output
 *  row per input row, carrying the cell value and the originating column name.
 *
 *  Output columns: the `by` columns, then `val` (the cell value), then `key`
 *  (the source column name) — matching the WhatIwant.csv layout.
 *
 *  NOTE: despite the name, this is a plain DataFrame transformation, not a
 *  Spark UDF; the name is kept unchanged for compatibility with callers.
 *
 *  @param df the wide input DataFrame
 *  @param by identifier columns kept as-is (e.g. Seq("no"))
 *  @return   the long-format DataFrame
 */
def myUDF(df: DataFrame, by: Seq[String]): DataFrame = {
  val (columns, types) = df.dtypes.filter { case (clm, _) => !by.contains(clm) }.unzip
  // `array(...)` needs all its elements to share one Spark SQL type, so every
  // melted column must agree. Fail early with a message instead of a bare
  // requirement failure.
  require(
    types.distinct.size == 1,
    s"All melted columns must share one type; found: ${types.distinct.mkString(", ")}"
  )
  // Build one struct per value column — (key = column name, val = cell
  // value) — and explode the array into one row per struct.
  val keys = explode(array(
    columns.map(clm => struct(lit(clm).alias("key"), col(clm).alias("val"))): _*
  ))
  val byValue = by.map(col)
  // `col("_key.val")` instead of `$"_key.val"` so this compiles without
  // `spark.implicits._` being imported.
  df.select(byValue :+ keys.alias("_key"): _*)
    .select(byValue ++ Seq(col("_key.val"), col("_key.key")): _*)
}

// Melt the sample frame, keeping "no" as the identifier column; the output
// rows look like (1, A1, key1), (1, B1, key2), ... as requested.
val df1 = myUDF(df, Seq("no"))
df1.show
Comments