Bhavesh Bhavesh - 1 month ago 8
Scala Question

Hadoop DistCp does not create the destination folder when we pass a single file

I am facing the issue below with Hadoop DistCp; any suggestion or help is highly appreciated.

I am trying to copy data from Google Cloud platform to Amazon S3

1) When we have multiple files to copy from source to destination (this works fine)

val sourcefile : String = "gs://XXXX_-abc_account2621/abc_account2621_click_20170616*.csv.gz"  // multiple files to copy — note the * wildcard in the file name

Output: S3://S3bucketname/xxx/xxxx/clientid=account2621/date=2017-08-18/
Files in above path
abc_account2621_click_2017061612_20170617_005852_572560033.csv.gz
abc_account2621_click_2017061616_20170617_045654_572608350.csv.gz
abc_account2621_click_2017061622_20170617_103107_572684922.csv.gz
abc_account2621_click_2017061623_20170617_120235_572705834.csv.gz


2) When we have only one file to copy from source to destination (Issue)

val sourcefile : String = "gs://XXXX_-abc_account2621/abc_account2621_activity_20170618_20170619_034412_573362513.csv.gz"

Output:S3://S3bucketname/xxx/xxxx/clientid=account2621/
Files in above path
date=2017-08-18 (the directory is replaced by the file's content, and the resulting file has no extension)


Code:

/**
 * Entry point: assembles the GCS source and S3 destination URIs from the
 * command-line arguments and hands them to [[executeDistCp]].
 *
 * Expected args (first five): environment, customer, typesoftables,
 * clientid, filedate.
 */
def main(args: Array[String]): Unit = {
  val Array(environment, customer, typesoftables, clientid, filedate) = args.take(5)

  // Destination key layout: <customer>/<types>/clientid=<id>/date=<date>/
  val s3Path = s"$customer/$typesoftables/clientid=$clientid/date=$filedate/"

  // NOTE(review): the source path is hard-coded — neither `environment` nor
  // the other CLI args influence it; presumably intentional for this repro.
  val sourcefile: String =
    "gs://XXXX_-abc_account2621//abc_account2621_activity_20170618_20170619_034412_573362513.csv.gz"
  val destination: String = s"s3n://S3bucketname/$s3Path"

  println(sourcefile)
  println(destination)

  executeDistCp(Array(sourcefile, destination))
}

/**
 * Runs Hadoop DistCp for `filepaths = Array(source, destination)`,
 * copying from Google Cloud Storage (gs://) to Amazon S3 (s3n://).
 *
 * Fix for the single-file case: after deleting the destination, explicitly
 * recreate it as a directory. When the destination does not exist and the
 * source resolves to exactly one file, DistCp treats the destination path as
 * the target FILE name — so the `date=...` segment ended up as a file holding
 * the data instead of a directory containing it. Creating the directory first
 * makes DistCp copy the single file INTO it, matching the multi-file behavior.
 */
def executeDistCp(filepaths: Array[String]): Unit = {
  val conf: Configuration = new Configuration()

  // GCS connector wiring (service-account JSON key auth).
  conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
  conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
  conf.set("google.cloud.auth.service.account.enable", "true")
  conf.set("fs.gs.project.id", "XXXX-XXXX")
  conf.set("google.cloud.auth.service.account.json.keyfile", "/tmp/XXXXX.json")

  // SECURITY(review): AWS credentials hard-coded in the job configuration —
  // prefer a Hadoop credential provider or an EMR instance profile.
  conf.set("fs.s3n.awsAccessKeyId", "XXXXXXXXXXXX")
  conf.set("fs.s3n.awsSecretAccessKey", "XXXXXXXXXXXXXX")

  // The MR task classpath must also carry the GCS connector jar.
  // (Original had this literal split by a raw newline — a syntax error;
  // rejoined here with identical content.)
  conf.set(
    "mapreduce.application.classpath",
    "$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*," +
      "$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*," +
      "/usr/lib/hadoop-lzo/lib/*,/usr/share/aws/emr/emrfs/conf," +
      "/usr/share/aws/emr/emrfs/lib/*,/usr/share/aws/emr/emrfs/auxlib/*," +
      "/usr/share/aws/emr/lib/*,/usr/share/aws/emr/ddb/lib/emr-ddb-hadoop.jar," +
      "/usr/share/aws/emr/goodies/lib/emr-hadoop-goodies.jar," +
      "/usr/share/aws/emr/cloudwatch-sink/lib/*,/usr/share/aws/aws-java-sdk/*," +
      "/tmp/gcs-connector-latest-hadoop2.jar")
  conf.set("HADOOP_CLASSPATH", "$HADOOP_CLASSPATH:/tmp/gcs-connector-latest-hadoop2.jar")

  val outputDir: Path = new Path(filepaths(1))
  val fs = outputDir.getFileSystem(conf)
  // Clear any previous run's output, then recreate the directory so a
  // single-source-file copy lands inside it (see scaladoc above).
  fs.delete(outputDir, true)
  fs.mkdirs(outputDir)

  val distCp: DistCp = new DistCp(conf, null)
  ToolRunner.run(distCp, filepaths)
}
}

Answer Source

Adding the code below fixed the issue.

Code

 // Recreate the destination directory (after the delete earlier in
 // executeDistCp) so that a single-file DistCp copies the file INTO the
 // directory instead of writing the file's content at the directory's path.
 val makeDir: Path = new Path(filepaths(1))
 makeDir.getFileSystem(conf).mkdirs(makeDir)