Extract first line of CSV file in Pig - csv

I have several CSV files and the header is always the first line in the file. What's the best way to get that line out of the CSV file as a string in Pig? Preprocessing with sed, awk etc is not an option.
I've tried loading the file with regular PigStorage and the Piggybank CsvLoader, but it's not clear to me how I can get that first line, if at all.
I'm open to writing a UDF, if that's what it takes.

Disclaimer: I'm not great with Java.
You are going to need a UDF. I'm not sure exactly what you are asking for, but this UDF will take a series of CSV files and turn them into maps, where the keys are the values at the top of the file. This should hopefully be enough of a skeleton so that you can change it into what you want.
The couple of tests I've done remotely and locally indicate that this will work.
package myudfs;
import java.io.IOException;
import org.apache.pig.LoadFunc;
import java.util.Map;
import java.util.HashMap;
import java.util.ArrayList;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.pig.PigException;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
public class ExampleCSVLoader extends LoadFunc {
protected RecordReader in = null;
private String fieldDel = "" + '\t';
private Map<String, String> outputMap = null;
private TupleFactory mTupleFactory = TupleFactory.getInstance();
// This stores the fields that are defined in the first line of the file
private ArrayList<Object> topfields = null;
public ExampleCSVLoader() {}
public ExampleCSVLoader(String delimiter) {
this();
this.fieldDel = delimiter;
}
@Override
public Tuple getNext() throws IOException {
try {
boolean notDone = in.nextKeyValue();
if (!notDone) {
outputMap = null;
topfields = null;
return null;
}
String value = in.getCurrentValue().toString();
String[] values = value.split(fieldDel);
Tuple t = mTupleFactory.newTuple(1);
ArrayList<Object> tf = new ArrayList<Object>();
int pos = 0;
for (int i = 0; i < values.length; i++) {
if (topfields == null) {
tf.add(values[i]);
} else {
readField(values[i], pos);
pos = pos + 1;
}
}
if (topfields == null) {
topfields = tf;
t = mTupleFactory.newTuple();
} else {
t.set(0, outputMap);
}
outputMap = null;
return t;
} catch (InterruptedException e) {
int errCode = 6018;
String errMsg = "Error while reading input";
throw new ExecException(errMsg, errCode,
PigException.REMOTE_ENVIRONMENT, e);
}
}
// Applies foo to the appropriate value in topfields
private void readField(String foo, int pos) {
if (outputMap == null) {
outputMap = new HashMap<String, String>();
}
outputMap.put((String) topfields.get(pos), foo);
}
@Override
public InputFormat getInputFormat() {
return new TextInputFormat();
}
@Override
public void prepareToRead(RecordReader reader, PigSplit split) {
in = reader;
}
@Override
public void setLocation(String location, Job job)
throws IOException {
FileInputFormat.setInputPaths(job, location);
}
}
Sample output from loading a directory containing these two files:
csv1.in          csv2.in
-------          ---------
A|B|C            D|E|F
Hello|This|is    PLEASE|WORK|FOO
FOO|BAR|BING     OR|EVERYTHING|WILL
BANG|BOSH        BE|FOR|NAUGHT
Produces this output:
A: {M: map[]}
()
([D#PLEASE,E#WORK,F#FOO])
([D#OR,E#EVERYTHING,F#WILL])
([D#BE,E#FOR,F#NAUGHT])
()
([A#Hello,B#This,C#is])
([A#FOO,B#BAR,C#BING])
([A#BANG,B#BOSH])
The ()s are the top lines of the files. getNext() requires that we return something, otherwise the file will stop being processed; therefore those header rows come back as empty tuples with a null schema.
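For reference, a minimal sketch of using the loader from a Pig script could look like this (the jar name and input path are assumptions; the pipe is escaped because the UDF splits with String.split(), which takes a regex):
REGISTER myudfs.jar;
A = LOAD '/path/to/csvdir' USING myudfs.ExampleCSVLoader('\\|');
DUMP A;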

If your CSV complies with the CSV conventions of Excel 2007, you can use the CSVExcelStorage loader that is already available in Piggybank: http://svn.apache.org/viewvc/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/CSVExcelStorage.java?view=markup
It has an option to skip the CSV header: SKIP_INPUT_HEADER
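For example, a minimal sketch (the jar path, delimiter, and schema are assumptions; the four-argument constructor ending in SKIP_INPUT_HEADER is the Piggybank option mentioned above):
REGISTER piggybank.jar;
A = LOAD 'input.csv'
    USING org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'NO_MULTILINE', 'UNIX', 'SKIP_INPUT_HEADER')
    AS (col1:chararray, col2:chararray, col3:chararray);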

JUnit - Parameterized Test - Stopping compiling

The code I wrote below stops before reaching the constructor or @Before (depending on which one is hidden). There are no errors and it never runs, not even once.
I followed this tutorial:
https://www.tutorialspoint.com/junit/junit_parameterized_test.htm
Does somebody have an idea of what is wrong with this code?
import static org.junit.Assert.*;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Scanner;
import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
@RunWith(Parameterized.class)
public class ParametryzowaneTestyKarty {
private ArrayList<Karta> talia;
private String wynik;
private karty kartyy;
@Before
public void initialize() {
kartyy = new karty();
}
public ParametryzowaneTestyKarty(ArrayList<Karta> talia, String wynik) {
this.talia = talia;
this.wynik = wynik;
}
@Parameterized.Parameters
public static Collection wyniki() throws FileNotFoundException {
File plik22 = new File("...");
Scanner test = new Scanner(plik22);
while(test.hasNextLine()) {
ArrayList<Karta> talia = new ArrayList<>();
String wiersz = test.nextLine();
String[] parts = wiersz.split(",");
for(int i=0;i<10;i+=2) {
String part0 = parts[i];
String part1 = parts[i+1];
int kol=Integer.parseInt(part0);
int fig=Integer.parseInt(part1);
Kolor[] k = Kolor.values();
Kolor ko=k[kol];
Figura[] f = Figura.values();
Figura fi = f[fig];
talia.add(new Karta(ko, fi));
String w = parts[10];
Arrays.asList(new Object[][] {
{ talia, w },
});
}
}
return Arrays.asList();
}
@Test
public void TestParametryzowaneKarty() {
System.out.println("1");
System.out.println("Karty : " + talia);
assertEquals(wynik,
karty.check(talia));
}
}
It would help to know the exact error you are getting.
There are some issues with your code: Arrays.asList() doesn't do what you are expecting, and thus the method public static Collection wyniki() returns an empty list.
The following code might fix the issue, but I doubt it, as the talia list is reused for each row in the file that is being read.
@Parameterized.Parameters
public static Collection wyniki() throws FileNotFoundException {
File plik22 = new File("...");
Scanner test = new Scanner(plik22);
// declare rows outside the loop so it is still in scope for the return statement
ArrayList<Object[]> rows = new ArrayList<>();
while(test.hasNextLine()) {
ArrayList<Karta> talia = new ArrayList<>();
String wiersz = test.nextLine();
String[] parts = wiersz.split(",");
for(int i=0;i<10;i+=2) {
String part0 = parts[i];
String part1 = parts[i+1];
int kol=Integer.parseInt(part0);
int fig=Integer.parseInt(part1);
Kolor[] k = Kolor.values();
Kolor ko=k[kol];
Figura[] f = Figura.values();
Figura fi = f[fig];
talia.add(new Karta(ko, fi));
String w = parts[10];
// new code
rows.add(new Object[]{talia, w} );
}
}
return rows;
}
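For reference, the Parameterized runner expects every element of the returned collection to be an Object[] whose entries line up with the test constructor parameters, so the returned data should end up shaped roughly like this (a sketch with hypothetical values):
return Arrays.asList(new Object[][] {
        { talia1, "wynik1" },
        { talia2, "wynik2" }
});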

MapReduce Function with JSON Files and JSONParser

I have some problems writing my MapReduce functions.
I want to solve the following problem:
I have a JSON file with 1 million JSON objects like this:
{"_id":3951,"title":"Two Family House (2000)","genres":["Drama"],"ratings":[{"userId":173,"rating":5},{"userId":195,"rating":5},{"userId":411,"rating":4},{"userId":593,"rating":2},{"userId":629,"rating":3},{"userId":830,"rating":3},{"userId":838,"rating":5},{"userId":850,"rating":4},{"userId":856,"rating":4},{"userId":862,"rating":5},{"userId":889,"rating":1},{"userId":928,"rating":5},{"userId":986,"rating":4},{"userId":1001,"rating":5},{"userId":1069,"rating":3},{"userId":1168,"rating":3},{"userId":1173,"rating":2},{"userId":1242,"rating":3},{"userId":1266,"rating":5},{"userId":1331,"rating":5},{"userId":1417,"rating":5},{"userId":1470,"rating":4},{"userId":1474,"rating":5},{"userId":1615,"rating":3},{"userId":1625,"rating":4},{"userId":1733,"rating":4},{"userId":1799,"rating":4},{"userId":1865,"rating":5},{"userId":1877,"rating":5},{"userId":1897,"rating":5},{"userId":1946,"rating":4},{"userId":2031,"rating":4},{"userId":2129,"rating":2},{"userId":2353,"rating":4},{"userId":2986,"rating":4},{"userId":3940,"rating":4},{"userId":3985,"rating":3},{"userId":4025,"rating":5},{"userId":4727,"rating":3},{"userId":5333,"rating":3}]}
and more....
One JSON object is a movie, which contains an array of ratings. I want to count all ratings in the JSON file.
I created a Maven project in IntelliJ with the dependencies for Hadoop and the JSON parser. My MapReduce class is this:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
import java.io.IOException;
import java.util.Iterator;
public class RatingCounter {
public static class RatingMapper extends Mapper<JSONObject, Text, Text, Text>{
private Text id = new Text();
private Text ratingAnzahl = new Text();
public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException{
JSONParser parser = new JSONParser();
try {
Object obj = parser.parse(value.toString());
JSONObject jsonObject = (JSONObject) obj;
String movieId = (String) jsonObject.get("_id");
int count = 0;
// loop array
JSONArray ratings = (JSONArray) jsonObject.get("ratings");
Iterator<String> iterator = ratings.iterator();
while (iterator.hasNext()) {
count++;
}
} catch (ParseException e) {
e.printStackTrace();
}
}
}
public static class RatingReducer extends Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
Text resultValue = new Text();
int allRatings = 0;
while (values.hasNext()){
allRatings += Integer.parseInt(values.toString());
}
resultValue.set(""+allRatings);
context.write(key, resultValue);
}
}
public static void main (String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf, "ratings count");
job.setJarByClass(RatingCounter.class);
job.setMapperClass(RatingMapper.class);
job.setReducerClass(RatingReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
I have no idea how to write the functions in the Mapper and Reducer. Can someone help me, please?
I've made a few changes to your mapper and reducer.
First, in your mapper you are not writing the output anywhere, and the type parameters used while extending the Mapper class are also wrong (arguably). The first input to any mapper is the LongWritable (or Object) byte offset of the line, not a JSONObject. You can see the changes below.
public static class RatingMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        JSONParser parser = new JSONParser();
        try {
            Object obj = parser.parse(value.toString());
            JSONObject jsonObject = (JSONObject) obj;
            // _id is a number in the JSON, so convert it instead of casting to String
            String movieId = String.valueOf(jsonObject.get("_id"));
            JSONArray ratings = (JSONArray) jsonObject.get("ratings");
            // the number of ratings for this movie is simply the size of the array
            context.write(new Text(movieId), new IntWritable(ratings.size()));
        } catch (ParseException e) {
            // an override of map() cannot declare ParseException, so wrap it
            throw new IOException(e);
        }
    }
}
Notice here, the output of map is written using context.write
Now, coming to your reducer: some things change because of the changes I made in the mapper. Also, since your number of ratings will always be an integer, you don't need to convert it to Text, parse it back with parseInt, and then convert it to Text again; just use IntWritable.
public static class RatingReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int allRatings = 0;
        // sum the per-movie counts emitted by the mappers
        for (IntWritable value : values) {
            allRatings += value.get();
        }
        context.write(key, new IntWritable(allRatings));
    }
}
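Since the mapper and reducer now emit IntWritable values, the driver in main() also needs to declare the matching output types, roughly like this (a sketch based on the job setup from the question):
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);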

MySQL ResultSet into Gson array

I want to put a MySQL result set into a JsonArray using the Gson library. How can I best achieve this? I've read this:
resultset to json using gson
But for some reason it uses the simple-JSON library in addition; I don't want that if possible. Is there any way to achieve this easily with the Gson library?
Thank you very much!
PlayerList.java:
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package de.freakyonline.ucone;
import de.freakyonline.ucone.Player;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.net.Socket;
import java.util.ArrayList;
import java.util.Iterator;
import javafx.application.Platform;
import javafx.collections.FXCollections;
import javafx.collections.ObservableList;
import javafx.scene.control.TextArea;
/**
*
* @author uwe
*/
public class PlayerList {
ObservableList<Player> playerList;
ObjectInputStream in;
ObjectOutputStream out;
Socket sock;
private Object obj = null;
private Object obj2 = null;
TextArea consoleOneTextArea;
public PlayerList(ObjectInputStream in, ObjectOutputStream out, Socket sock, TextArea consoleOneTextArea) {
this.in = in;
this.out = out;
this.sock = sock;
this.consoleOneTextArea = consoleOneTextArea;
getPlayersFromServer();
}
private void getPlayersFromServer() {
/* try {
out.writeObject("getplayers");
obj=in.readObject();
if(obj == null) {
System.out.println("ERROR! void getPlayersFromServer in PlayerList.java");
Platform.exit();
}
String command = obj.toString().toLowerCase();
String currentFromServer;
if(command.equalsIgnoreCase("getplayers")) {
while((obj2=in.readObject()) != null) {
currentFromServer = obj2.toString().toLowerCase();
for(String cell : currentFromServer.split(" ")) {
System.out.println(cell.toString());
}
if (currentFromServer.equalsIgnoreCase("done")) {
consoleOneTextArea.appendText("This is finished. Have fun!\n");
break;
}
consoleOneTextArea.appendText(currentFromServer + "\n");
}
} { System.out.println("ERROR"); }
} catch (Exception ex) { ex.printStackTrace(); }
*/
this.playerList = FXCollections.observableArrayList(
new Player("freakyy85","Owner","1810",31,"m", "missing..."),
new Player("Ky3ak","Owner","1920",34,"m", "missing...")
);
}
}
(ive commented out some parts, as they are not relevant anymore)
Player.java:
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package de.freakyonline.ucone;
import com.google.gson.stream.JsonReader;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.net.Socket;
import javafx.scene.control.TextArea;
/**
*
* @author uwe
*/
public class Remote implements Runnable {
private Object obj = null;
private Object obj2 = null;
private ObjectInputStream in;
private ObjectOutputStream out;
private Socket sock;
private TextArea consoleOneTextArea;
public Remote (ObjectInputStream in, ObjectOutputStream out, Socket sock) {
this.in = in;
this.out = out;
this.sock = sock;
}
public ObjectInputStream getIn() {
return in;
}
public ObjectOutputStream getOut() {
return out;
}
public Socket getSock() {
return sock;
}
public void setConsoleOneTextArea(TextArea consoleOneTextArea) {
this.consoleOneTextArea = consoleOneTextArea;
}
public void run() {
try {
while((obj=in.readObject()) != null && sock.isConnected()) {
String command = obj.toString().toLowerCase();
String currentFromServer;
switch(command) {
case "getplayers":
/* while((obj2=in.readObject()) != null) {
currentFromServer = obj2.toString().toLowerCase();
if (currentFromServer.equalsIgnoreCase("done")) {
consoleOneTextArea.appendText("This is finished. Have fun!\n");
break;
}
consoleOneTextArea.appendText(currentFromServer + "\n");
*/ }
JsonReader jsonReader = new JsonReader(new InputStreamReader(in, "UTF-8"));
jsonReader.close();
break;
}
} catch (Exception ex) { ex.printStackTrace(); }
}
}
Is there any way to achieve this easily with the gson library?
Not really. Gson and JDBC are two unrelated things, so you have to implement a custom remapping function to "decode" JDBC result set rows/fields and "encode" them back to JSON arrays/objects respectively. Accumulating a JsonArray instance may be expensive from the memory consumption point of view, or may even crash the application with an OutOfMemoryError for huge result sets. Nonetheless it is fine if the result sets are known to be small or LIMITed.
Accumulating JsonArray
static JsonArray resultSetToJsonArray(final ResultSet resultSet)
throws SQLException {
final ResultSetMetaData metaData = resultSet.getMetaData();
// JsonArray is a Gson built-in class to hold JSON arrays
final JsonArray jsonArray = new JsonArray();
while ( resultSet.next() ) {
jsonArray.add(resultSetRowToJsonObject(resultSet, metaData));
}
return jsonArray;
}
private static JsonElement resultSetRowToJsonObject(final ResultSet resultSet, final ResultSetMetaData metaData)
throws SQLException {
final int columnCount = metaData.getColumnCount();
// Every result set row is a JsonObject equivalent
final JsonObject jsonObject = new JsonObject();
// JDBC uses 1-based column indexes
for ( int i = 1; i <= columnCount; i++ ) {
jsonObject.add(metaData.getColumnName(i), fieldToJsonElement(resultSet, metaData, i));
}
return jsonObject;
}
private static JsonElement fieldToJsonElement(final ResultSet resultSet, final ResultSetMetaData metaData, final int column)
throws SQLException {
final int columnType = metaData.getColumnType(column);
final Optional<JsonElement> jsonElement;
// Process each SQL type mapping a value to a JSON tree equivalent
switch ( columnType ) {
case Types.BIT:
case Types.TINYINT:
case Types.SMALLINT:
throw new UnsupportedOperationException("TODO: " + JDBCType.valueOf(columnType));
case Types.INTEGER:
// resultSet.getInt() returns 0 in case of null, so it must be extracted with getObject and cast, then converted to a JsonPrimitive
jsonElement = Optional.ofNullable((Integer) resultSet.getObject(column)).map(JsonPrimitive::new);
break;
case Types.BIGINT:
case Types.FLOAT:
case Types.REAL:
case Types.DOUBLE:
case Types.NUMERIC:
case Types.DECIMAL:
case Types.CHAR:
throw new UnsupportedOperationException("TODO: " + JDBCType.valueOf(columnType));
case Types.VARCHAR:
jsonElement = Optional.ofNullable(resultSet.getString(column)).map(JsonPrimitive::new);
break;
case Types.LONGVARCHAR:
case Types.DATE:
case Types.TIME:
case Types.TIMESTAMP:
case Types.BINARY:
case Types.VARBINARY:
case Types.LONGVARBINARY:
case Types.NULL:
case Types.OTHER:
case Types.JAVA_OBJECT:
case Types.DISTINCT:
case Types.STRUCT:
case Types.ARRAY:
case Types.BLOB:
case Types.CLOB:
case Types.REF:
case Types.DATALINK:
case Types.BOOLEAN:
case Types.ROWID:
case Types.NCHAR:
case Types.NVARCHAR:
case Types.LONGNVARCHAR:
case Types.NCLOB:
case Types.SQLXML:
case Types.REF_CURSOR:
case Types.TIME_WITH_TIMEZONE:
case Types.TIMESTAMP_WITH_TIMEZONE:
throw new UnsupportedOperationException("TODO: " + JDBCType.valueOf(columnType));
default:
throw new UnsupportedOperationException("Unknown type: " + columnType);
}
// If the optional value is missing, assume it's a null
return jsonElement.orElse(JsonNull.INSTANCE);
}
final JsonArray jsonArray = resultSetToJsonArray(resultSet);
System.out.println(jsonArray);
Don't forget to close the resultSet, of course.
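For example, a minimal sketch of wiring it up with try-with-resources (the JDBC URL, credentials, and query are assumptions):
try ( final Connection connection = DriverManager.getConnection("jdbc:mysql://localhost/test", "user", "password");
      final Statement statement = connection.createStatement();
      final ResultSet resultSet = statement.executeQuery("SELECT * FROM `table`") ) {
    // the result set is converted eagerly, then closed by try-with-resources
    System.out.println(resultSetToJsonArray(resultSet));
}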
JSON streaming
If the JsonArray is supposed to be written elsewhere, JsonWriter can be a better solution being able to process huge result sets reading row by row and writing JSON element by JSON element.
#SuppressWarnings("resource")
static void resultSetToJsonArrayStream(final ResultSet resultSet, final JsonWriter jsonWriter)
throws SQLException, IOException {
// Write the [ token
jsonWriter.beginArray();
final ResultSetMetaData metaData = resultSet.getMetaData();
while ( resultSet.next() ) {
// Write row by row
writeRow(resultSet, jsonWriter, metaData);
}
// Finish the array with the ] token
jsonWriter.endArray();
}
#SuppressWarnings("resource")
private static void writeRow(final ResultSet resultSet, final JsonWriter jsonWriter, final ResultSetMetaData metaData)
throws SQLException, IOException {
final int columnCount = metaData.getColumnCount();
// Similarly to the outer array: the { token starts a new object representing a row
jsonWriter.beginObject();
for ( int i = 1; i <= columnCount; i++ ) {
// Write the column name and try to resolve a JSON literal to be written
jsonWriter.name(metaData.getColumnName(i));
writeField(resultSet, jsonWriter, metaData, i);
}
// Terminate the object with }
jsonWriter.endObject();
}
#SuppressWarnings("resource")
private static void writeField(final ResultSet resultSet, final JsonWriter jsonWriter, final ResultSetMetaData metaData, final int column)
throws SQLException, IOException {
final int columnType = metaData.getColumnType(column);
switch ( columnType ) {
case Types.BIT:
case Types.TINYINT:
case Types.SMALLINT:
throw new UnsupportedOperationException("TODO: " + JDBCType.valueOf(columnType));
case Types.INTEGER:
jsonWriter.value((Integer) resultSet.getObject(column));
break;
case Types.BIGINT:
case Types.FLOAT:
case Types.REAL:
case Types.DOUBLE:
case Types.NUMERIC:
case Types.DECIMAL:
case Types.CHAR:
throw new UnsupportedOperationException("TODO: " + JDBCType.valueOf(columnType));
case Types.VARCHAR:
jsonWriter.value((String) resultSet.getObject(column));
break;
case Types.LONGVARCHAR:
case Types.DATE:
case Types.TIME:
case Types.TIMESTAMP:
case Types.BINARY:
case Types.VARBINARY:
case Types.LONGVARBINARY:
case Types.NULL:
case Types.OTHER:
case Types.JAVA_OBJECT:
case Types.DISTINCT:
case Types.STRUCT:
case Types.ARRAY:
case Types.BLOB:
case Types.CLOB:
case Types.REF:
case Types.DATALINK:
case Types.BOOLEAN:
case Types.ROWID:
case Types.NCHAR:
case Types.NVARCHAR:
case Types.LONGNVARCHAR:
case Types.NCLOB:
case Types.SQLXML:
case Types.REF_CURSOR:
case Types.TIME_WITH_TIMEZONE:
case Types.TIMESTAMP_WITH_TIMEZONE:
throw new UnsupportedOperationException("TODO: " + JDBCType.valueOf(columnType));
default:
throw new UnsupportedOperationException("Unknown type: " + columnType);
}
}
Example of writing to System.out, but of course it can write anywhere, just by supplying an appropriate OutputStream instance:
final JsonWriter jsonWriter = new JsonWriter(new OutputStreamWriter(System.out));
resultSetToJsonArrayStream(resultSet, jsonWriter);
Similarly to ResultSet, JsonWriter must be closed as well.
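Since JsonWriter implements Closeable, try-with-resources can take care of that as well (a sketch):
try ( final JsonWriter jsonWriter = new JsonWriter(new OutputStreamWriter(System.out, StandardCharsets.UTF_8)) ) {
    // rows are streamed out one by one, so nothing large is held in memory
    resultSetToJsonArrayStream(resultSet, jsonWriter);
}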
I've written the above code for SQLite, but it should work for MySQL too. For example, a test database created and populated with the following SQL statements:
CREATE TABLE `table` (i NUMBER NOT NULL, s TEXT NOT NULL);
INSERT INTO `table` (i, s) VALUES (1, 'foo');
INSERT INTO `table` (i, s) VALUES (2, 'bar');
INSERT INTO `table` (i, s) VALUES (3, 'baz');
will result in
[{"i":1,"s":"foo"},{"i":2,"s":"bar"},{"i":3,"s":"baz"}]
for both object model and streaming approaches.

How to process a flat file with JSON string as a part of each line, into CSV file Using PIG Loader?

I have a file in HDFS as
44,UK,{"names":{"name1":"John","name2":"marry","name3":"stuart"},"fruits":{"fruit1":"apple","fruit2":"orange"}},31-07-2016
91,INDIA,{"names":{"name1":"Ram","name2":"Sam"},"fruits":{}},31-07-2016
and want to store this into a CSV file as below, using a Pig loader:
44,UK,names,name1,John,31-07-2016
44,UK,names,name2,Marry,31-07-2016
..
44,UK,fruit,fruit1,apple,31-07-2016
..
91,INDIA,names,name1,Ram,31-07-2016
..
91,INDIA,null,null,Ram,31-07-2016
What should be the PIG script for this ?
Since your record is not a proper JSON string, no JSON storer/loader will help you. Writing a UDF will be a simpler approach.
UPDATED APPROACH 1 :-
The UDF and Pig script below will work if you convert your input to a tab-separated file.
UDF :-
package com.test.udf;
import org.apache.commons.lang3.StringUtils;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.codehaus.jackson.map.ObjectMapper;
import org.codehaus.jackson.type.TypeReference;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
*input format :-
* {"names":{"name1":"John","name2":"marry","name3":"stuart"},"fruits": {"fruit1":"apple","fruit2":"orange"}}
*/
public class jsonToTuples extends EvalFunc<DataBag> {
ObjectMapper objectMapper = new ObjectMapper();
TypeReference typeRef = new TypeReference<HashMap<String, Object>>() {
};
@Override
public DataBag exec(Tuple input) throws IOException {
if (input == null || input.size() == 0) {
return null;
} else {
String jsonRecord = (String) input.get(0);
if (StringUtils.isNotBlank(jsonRecord)) {
try {
List<String> recordList = new ArrayList<String>();
Map<String, Object> jsonDataMap = objectMapper.readValue(jsonRecord, typeRef);
if(jsonDataMap.get("names") != null) {
Map<String, String> namesDataMap = (Map<String, String>) jsonDataMap.get("names");
for(String key : namesDataMap.keySet()){
recordList.add("names" + "," + key + "," + namesDataMap.get(key));
}
}
if(jsonDataMap.get("fruits") != null) {
Map<String, String> fruitsDataMap = (Map<String, String>) jsonDataMap.get("fruits");
for(String key : fruitsDataMap.keySet()){
recordList.add("fruits" + "," + key + "," + fruitsDataMap.get(key));
}
}
DataBag outputBag = BagFactory.getInstance().newDefaultBag();
for( int i = 0 ; i < recordList.size() ; i++){
Tuple outputTuple = TupleFactory.getInstance().newTuple(1);
outputTuple.set(0 , recordList.get(i));
outputBag.add(outputTuple);
}
return outputBag;
}catch(Exception e){
System.out.println("caught exception for ");
e.printStackTrace();
return null;
}
}
}
return null;
}
}
PIG SCRIPT :-
register 'testUDF.jar' ;
A = load 'data.txt' using PigStorage() as (id:chararray , country:chararray , record:chararray , date:chararray);
B = Foreach A generate id, country , FLATTEN(com.test.udf.jsonToTuples(record)) , date ;
dump B ;
OLD APPROACH :-
Below is the way I would read your record in a UDF if it is comma separated.
As mentioned in my comment below, try to use the magic of split in the UDF to separate your fields. I have not tested it, but here is what I would try in my UDF :-
(please note that I am not sure this is the best option - you may want to improve it further.)
String[] strSplit = ((String) input.get(0)).split("," , 3);
String id = strSplit[0] ;
String country = strSplit[1] ;
String jsonWithDate = strSplit[2] ;
String[] datePart = ((String) input.get(0)).split(",");
String date = datePart[datePart.length-1];
/**
* above jsonWithDate should look like -
* {"names":{"name1":"Ram","name2":"Sam"},"fruits":{}},31-07-2016
*
*/
String jsonString = jsonWithDate.replace(date, "").replaceAll(",$", ""); // replaceAll treats ",$" as a regex for a trailing comma
/**
* now use some parser or object mapper to convert jsonString to desired list of values.
*/
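Continuing that sketch, the jsonString could then be handed to the same Jackson ObjectMapper used in the UDF above (untested, shown only to connect the two snippets):
ObjectMapper objectMapper = new ObjectMapper();
Map<String, Object> jsonDataMap =
        objectMapper.readValue(jsonString, new TypeReference<HashMap<String, Object>>() {});
// jsonDataMap now holds the "names" and "fruits" maps, as in the UPDATED APPROACH above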

Anyone able to get directory listing feature work in Android

It seems that the recently released Google Drive SDK supports directory listing.
https://developers.google.com/drive/v2/reference/files/list
I am trying to integrate it into my Android app.
package com.jstock.cloud;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import android.util.Log;
import com.google.api.client.googleapis.auth.oauth2.GoogleCredential;
import com.google.api.client.googleapis.services.GoogleKeyInitializer;
import com.google.api.client.http.HttpTransport;
import com.google.api.client.json.JsonFactory;
import com.google.api.client.json.gson.GsonFactory;
import com.google.api.client.extensions.android2.AndroidHttp;
import com.google.api.services.drive.Drive;
import com.google.api.services.drive.Drive.Files;
import com.google.api.services.drive.model.File;
import com.google.api.services.drive.model.FileList;
import com.jstock.gui.Utils;
public class CloudFile {
public final java.io.File file;
public final long checksum;
public final long date;
public final int version;
private CloudFile(java.io.File file, long checksum, long date, int version) {
this.file = file;
this.checksum = checksum;
this.date = date;
this.version = version;
}
public static CloudFile newInstance(java.io.File file, long checksum, long date, int version) {
return new CloudFile(file, checksum, date, version);
}
public static CloudFile loadFromGoogleDrive(String authToken) {
final HttpTransport transport = AndroidHttp.newCompatibleTransport();
final JsonFactory jsonFactory = new GsonFactory();
GoogleCredential credential = new GoogleCredential();
credential.setAccessToken(authToken);
Drive service = new Drive.Builder(transport, jsonFactory, credential)
.setApplicationName(Utils.getApplicationName())
.setJsonHttpRequestInitializer(new GoogleKeyInitializer(ClientCredentials.KEY))
.build();
List<File> files = retrieveAllFiles(service);
Log.i("CHEOK", "size is " + files.size());
for (File file : files) {
Log.i(TAG, "title = " + file.getTitle());
}
return null;
}
/**
* Retrieve a list of File resources.
*
* @param service Drive API service instance.
* @return List of File resources.
*/
private static List<File> retrieveAllFiles(Drive service) {
List<File> result = new ArrayList<File>();
Files.List request = null;
try {
request = service.files().list();
} catch (IOException e) {
Log.e(TAG, "", e);
return result;
}
do {
try {
FileList files = request.execute();
result.addAll(files.getItems());
request.setPageToken(files.getNextPageToken());
} catch (IOException e) {
Log.e(TAG, "", e);
request.setPageToken(null);
}
} while (request.getPageToken() != null && request.getPageToken().length() > 0);
Log.i("CHEOK", "yup!");
return result;
}
private static final String TAG = "CloudFile";
}
I always get 0 files returned from the server, and there isn't any exception being thrown. Is there anything I have missed?
You have to request access to the full Drive scope to list all files: https://developers.google.com/drive/scopes#requesting_full_drive_scope_for_an_app
If you use the default (and recommended) scope https://www.googleapis.com/auth/drive.file, you will only be able to see files created or opened by the app.
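For example, if the token comes from Android's AccountManager, requesting the full scope looks roughly like this (a sketch; context and account are placeholders for your own objects, and this must not be called on the main thread):
// "oauth2:" + scope is the token type AccountManager expects for Google OAuth2 tokens;
// the full Drive scope lets files().list() see files that were not created by this app
String authToken = AccountManager.get(context).blockingGetAuthToken(
        account, "oauth2:https://www.googleapis.com/auth/drive", true);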