How to improve the reading efficiency of a file (.CSV, .TXT) with Kotlin

I made a program in JavaFX with Kotlin that reads CSV and TXT files delimited by ";". It works, but it has efficiency problems and I don't know how to improve the construction of the SQL query.
fun generatedDelimited(filePath: String, table: String = "") {
    val sourceFile = File(filePath)
    var line: String?
    var header: Array<String>? = null
    val lines: MutableList<List<String>> = ArrayList()
    try {
        BufferedReader(FileReader(sourceFile)).use { br ->
            header = br.readLine().split(";").toTypedArray()
            while (br.readLine().also { line = it } != null) {
                val values: Array<String> = line!!.split(";").toTypedArray()
                lines.add(Arrays.asList(*values))
            }
        }
    } catch (e: IOException) {
        e.printStackTrace()
    }
    val joined = "INSERT INTO $table (${header!!.joinToString(separator = ",")})\n"
    var textSelect = "${joined}SELECT * FROM ( \n"
    var selectUnion = ""
    var lineNo = 1
    for (line in lines) {
        var columnNo = 0
        var comma = ", "
        var select = "SELECT "
        var union = "UNION ALL\n"
        if (lines.size.equals(lineNo)) {
            union = ""
        }
        for (value in line) {
            if (columnNo == 1) {
                select = ""
            }
            if (line.size.equals(columnNo + 1)) {
                comma = " FROM DUAL \n$union"
            }
            selectUnion += "$select'$value' as ${header!![columnNo]}$comma"
            columnNo++
        }
        lineNo++
    }
    textSelect += "$selectUnion);"
    querySQL.text = textSelect
}
Result:
INSERT INTO werwsf (DATA1,DATA2,DATA3,DATA4,DATA5)
SELECT * FROM (
SELECT 'HOLA1' as DATA1, 'HAKA2' as DATA2, 'HAD3' as DATA3, '' as DATA4, 'ASDAD5' as DATA5 FROM DUAL
UNION ALL
SELECT 'HOLA6' as DATA1, 'HAKA7' as DATA2, 'HAD8' as DATA3, 'FA9' as DATA4, 'ASDAD10' as DATA5 FROM DUAL
);
Is there a way to improve the efficiency? With 1600 rows it takes 5 minutes.
Thank you.

This should be an optimised version of your code:
I used the Kotlin standard joinToString function, which uses a StringBuilder under the hood, as #0009laH advised. I also removed the redundant list <-> array conversions, and the header line is now split only once, with the resulting column names reused for both the INSERT column list and the column aliases. All these changes should result in faster, more readable and more concise code.
fun generatedDelimited(filePath: String, table: String = "") {
    val sourceFile = File(filePath)
    val fileLines: List<String> = sourceFile.readLines()
    // Split the header once and reuse the column names below.
    val headerColumns: List<String> = fileLines.first().split(";")
    val lines: List<List<String>> = fileLines.drop(1).map { line ->
        line.split(";")
    }
    val selectUnion = lines.joinToString(separator = "UNION ALL\n") { line ->
        line.withIndex().joinToString(separator = ", ", prefix = "SELECT ", postfix = " FROM DUAL\n") { (columnNo, value) ->
            "'$value' as ${headerColumns[columnNo]}"
        }
    }
    querySQL.text = "INSERT INTO $table (${headerColumns.joinToString(",")})\nSELECT * FROM (\n$selectUnion);"
}
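
A further gain, if the rows ultimately end up in the database anyway, is to skip building one giant INSERT ... SELECT ... UNION ALL string and use a batched PreparedStatement instead: the quadratic string growth disappears and the values are bound safely. This is only a minimal sketch under the assumption that a JDBC java.sql.Connection to the target database is available (the function name and the empty-cell fallback are mine, not from the original code):

import java.io.File
import java.sql.Connection

// Sketch: batch-insert the delimited file instead of generating one huge
// INSERT ... SELECT ... UNION ALL statement. Assumes a JDBC Connection.
fun batchInsertDelimited(filePath: String, table: String, connection: Connection) {
    val fileLines = File(filePath).readLines()
    val headerColumns = fileLines.first().split(";")
    val placeholders = headerColumns.joinToString(",") { "?" }
    val sql = "INSERT INTO $table (${headerColumns.joinToString(",")}) VALUES ($placeholders)"
    connection.prepareStatement(sql).use { stmt ->
        for (line in fileLines.drop(1)) {
            val values = line.split(";")
            for (i in headerColumns.indices) {
                // Bind each cell as a string; missing cells become empty strings.
                stmt.setString(i + 1, values.getOrElse(i) { "" })
            }
            stmt.addBatch()
        }
        stmt.executeBatch() // one round trip for the whole batch, not per row
    }
}

For very large files, calling executeBatch() every few thousand rows keeps memory bounded while still avoiding a round trip per row.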

Related

How to rename a duplicate column inside nested JSON using Spark Scala dynamically

I have a nested JSON dataset that I flatten using Spark with Scala. When I try to flatten it, it fails because of duplicate columns, so I have to rename them first, but I am unable to do so. Below is my code to flatten the dataset:
def flattenDataframe(df: DataFrame): DataFrame = {
  // getting all the fields from schema
  val fields = df.schema.fields
  val fieldNames = fields.map(x => x.name)
  // length shows the number of fields inside dataframe
  val length = fields.length
  for (i <- 0 to fields.length - 1) {
    val field = fields(i)
    val fieldtype = field.dataType
    val fieldName = field.name
    fieldtype match {
      case arrayType: ArrayType =>
        val fieldName1 = fieldName
        val fieldNamesExcludingArray = fieldNames.filter(_ != fieldName1)
        val fieldNamesAndExplode = fieldNamesExcludingArray ++ Array(s"explode_outer($fieldName1) as $fieldName1")
        // val fieldNamesToSelect = (fieldNamesExcludingArray ++ Array(s"$fieldName1.*"))
        val explodedDf = df.selectExpr(fieldNamesAndExplode: _*)
        return flattenDataframe(explodedDf)
      case structType: StructType =>
        val childFieldnames = structType.fieldNames.map(childname => fieldName + "." + childname)
        val newfieldNames = fieldNames.filter(_ != fieldName) ++ childFieldnames
        val renamedcols = newfieldNames.map(x => col(x.toString()).as(x.toString().replace(".", "_").replace("$", "_").replace("__", "_").replace(" ", "").replace("-", "")))
        val explodedf = df.select(renamedcols: _*)
        return flattenDataframe(explodedf)
      case _ =>
    }
  }
  df
}
After flattening, the dataset contains duplicate column names (screenshot omitted).
I have used the loop below for renaming duplicate columns, and it works fine for JSON elements that are on the same level:
val orgList = df.columns.toList // take the original list
val dupsList = orgList.map(_.toLowerCase()).diff(orgList.map(_.toLowerCase()).distinct).distinct // take the duplicate columns list in lower case
var i = 1
var newDf: DataFrame = df
for (dups <- dupsList) {
  i = 1
  for (key <- orgList) {
    if (key.toLowerCase() == dups) {
      newDf = newDf.withColumnRenamed(key, key + "_" + i)
      i += 1
      // println(i)
    }
  }
}
How do I rename it before flattening it?
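
Setting the nested case aside for a moment, the same-level renaming in the loop above boils down to suffixing case-insensitive duplicate names with a counter. As a language-agnostic illustration of just that dedup step, here is a minimal sketch (in Kotlin; the function name is hypothetical):

// Suffix case-insensitive duplicate names with _1, _2, ... like the loop above.
// The function name is hypothetical, not from the question's code.
fun dedupColumns(columns: List<String>): List<String> {
    val duplicated = columns.groupingBy { it.lowercase() }
        .eachCount()
        .filterValues { it > 1 }
        .keys
    val counters = mutableMapOf<String, Int>()
    return columns.map { name ->
        val key = name.lowercase()
        if (key in duplicated) {
            val n = (counters[key] ?: 0) + 1
            counters[key] = n
            "${name}_$n"
        } else name
    }
}

For example, dedupColumns(listOf("Id", "id", "name")) yields [Id_1, id_2, name], matching what the loop produces for same-level duplicates; how to apply this inside nested structs before flattening is the open question.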

How to update all columns at once in MySQL

There is a table with 200 columns. How do I update the data in columns 21 to 200 without writing it out like this:
Set Column21 = NEW.Column21,
Column22 = NEW.Column22,
Column23 = NEW.Column23,
Column24 = NEW.Column24,
Column25 = NEW.Column25...
// You can loop through 21 to 200 and build the string
let parts = [];
for (let i = 21; i <= 200; i++) {
  let column = 'Column' + i;
  let val = 'NEW.' + column; // or access the value some other way
  parts.push(`${column} = ${val}`);
}
let sql = 'SET ' + parts.join(', ');
console.log(sql);
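
The same one-off generation works in any language; for instance, a minimal Kotlin sketch that prints the SET clause for columns 21 to 200 (the NEW. prefix follows the trigger syntax in the question; the function name is mine):

// Build "SET Column21 = NEW.Column21, ..., Column200 = NEW.Column200"
// for pasting into the trigger body; the 21..200 range comes from the question.
fun buildSetClause(from: Int = 21, to: Int = 200): String =
    (from..to).joinToString(prefix = "SET ", separator = ",\n    ") { i ->
        "Column$i = NEW.Column$i"
    }

fun main() {
    println(buildSetClause())
}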

How to parse multiple JSON structures in a Spark program

I am working on parsing logs (JSON format) in Scala and I don't know how to proceed. I may get different kinds of logs to be processed.
How do I write/design my code to handle different types of JSON structures?
Can I give my Scala program a schema and let it parse?
I wrote some code using ObjectMapper and read through the nodes, but I want a more structure-agnostic approach.
I am not sure where to start. Please point me to some reading or examples. I tried Google and searching on Stack Overflow, but there are too many examples and it is confusing, as I am also still learning Scala.
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.fs.Path
import java.text.SimpleDateFormat
import java.util.Date
import java.util.Calendar
import org.apache.spark.sql.hive.HiveContext
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.core.JsonParseException
import com.fasterxml.jackson.databind.JsonMappingException
import org.apache.spark.rdd.RDD

sc.setLogLevel("OFF")
val args = sc.getConf.get("spark.driver.args").split("\\s+")
args.foreach(println)

var envStr = "dev"
var srcStr = "appm"
val RootFolderStr = "/source_folder/"
val DestFolderStr = "/dest_folder/"
val dateformatter = new SimpleDateFormat("yyyy-MM-dd'T'hh:mm:ss.SSS'Z'")
val formatter = new SimpleDateFormat("yyyy-MM-dd")
val theMonthFormatter = new SimpleDateFormat("yyyy-MM")
var fromDay: Date = formatter.parse("2018-04-29")
var toDay: Date = formatter.parse("2018-05-01")

if (args.length < 2) {
  printf("usage: need at least 2 parameters in spark.driver.args")
  sys.exit(2)
}
envStr = args(0).toLowerCase()
srcStr = args(1).toLowerCase()
if (args.length == 4) {
  fromDay = formatter.parse(args(2))
  toDay = formatter.parse(args(3))
}
if (args.length == 2) {
  // default to be yesterday to today
  toDay = formatter.parse(formatter.format(Calendar.getInstance().getTime()))
  val previousDay = Calendar.getInstance()
  previousDay.add(Calendar.DATE, -1)
  fromDay = formatter.parse(formatter.format(previousDay.getTime()))
}

// get the sub-folder for the monthly partition
val monthFolder = theMonthFormatter.format(fromDay)
var rootFolder = RootFolderStr.replaceAll("ENV", envStr) + monthFolder
rootFolder = rootFolder.replaceAll("SRC", srcStr)
val destFolder = DestFolderStr.replaceAll("ENV", envStr)

var toCalendar = Calendar.getInstance()
toCalendar.setTime(toDay)
toCalendar.add(Calendar.DATE, 1)
// need to consider the case across the month boundary
val toDay2 = formatter.parse(formatter.format(toCalendar.getTime()))

// filter out .tmp files and 0-size files
// .tmp files are not safe to read from, it's possible that the files are under updating by Flume job and the message data is incomplete
// when the Spark job starts to read from it.
val pathInfos = FileSystem.get(sc.hadoopConfiguration).listStatus(new Path(rootFolder))
// filter out the 0-length files, .tmp files which is of today
val allfiles = pathInfos.filter(fileStatus => {
  if (fileStatus.getLen == 0)
    false
  else {
    val aPath = fileStatus.getPath().getName()
    // use the modification time is more accurate.
    val lastTime = fileStatus.getModificationTime()
    val aDate = new Date(lastTime)
    // all files between fromDay and toDay2
    aDate.after(fromDay) && aDate.before(toDay2)
  }
}).map(_.getPath.toString)

case class event_log(
  time_stp: Long,
  msg_sze: Int,
  msg_src: String,
  action_path: String,
  s_code: Int,
  s_desc: String,
  p_code: String,
  c_id: String,
  m_id: String,
  c_ip: String,
  c_gp: String,
  gip: String,
  ggip: String,
  rbody: String
)
def readEvents(fileList: Array[String], msgSrc: String, fromTS: Long, toTS: Long): RDD[event_log] = {
  val records =
    sc.sequenceFile[Long, String](fileList.mkString(","))
      .filter((message) => {
        message._1 >= fromTS && message._1 < toTS
      })
  val eventLogs = records.map((message) => {
    val time_stp = message._1
    var msg_sze = message._2.length()
    var c_id = ""
    var m_id = ""
    var p_code = ""
    var c_ip = ""
    var c_gp = ""
    var gip = ""
    var ggip = ""
    var rbody = ""
    var action_path = ""
    var s_code: Int = 200
    var s_desc = ""
    try {
      // parse the message
      val mapper = new ObjectMapper()
      val aBuff = message._2.getBytes()
      val root = mapper.readTree(aBuff)
      var aNode = root.path("rbody")
      rbody = aNode.textValue()
      if (rbody != null && rbody.length() > 0) {
        val mapper_2 = new ObjectMapper()
        val aBuff_2 = rbody.getBytes()
        var root2 = mapper_2.readTree(aBuff_2)
        aNode = root2.path("p_code")
        if (aNode != null && aNode.isValueNode())
          p_code = String.valueOf(aNode.intValue())
        aNode = root2.path("mkay")
        if (aNode != null && aNode.isObject()) {
          root2 = aNode
        }
        aNode = root2.get("c_id")
        if (aNode != null && aNode.isValueNode())
          c_id = aNode.textValue()
        aNode = root2.get("m_id")
        if (aNode != null && aNode.isValueNode()) {
          m_id = String.valueOf(aNode.intValue())
        }
      }
      aNode = root.path("c_ip")
      c_ip = aNode.textValue()
      aNode = root.path("c_gp")
      c_gp = aNode.textValue()
      aNode = root.path("gip")
      gip = aNode.textValue()
      aNode = root.path("ggip")
      ggip = aNode.textValue()
      aNode = root.path("action_path")
      action_path = aNode.textValue()
      aNode = root.path("s_code")
      val statusNodeValue = aNode.textValue().trim()
      s_code = Integer.valueOf(statusNodeValue.substring(0, 3))
      s_desc = statusNodeValue.substring(3).trim()
    } catch {
      // set msg_sze to 0 as an indicator that it's not a well-formatted JSON message
      case jex: JsonParseException => msg_sze = 0
      case ioEx: java.io.IOException => msg_sze = 0
      case rtEx: JsonMappingException => msg_sze = 0
    }
    event_log(time_stp, msg_sze, msgSrc, action_path, s_code, s_desc,
      p_code, c_id, m_id,
      c_ip, c_gp, gip, ggip,
      rbody)
  })
  eventLogs
}
val hiveContext = new HiveContext(sc)
if (allfiles.length == 0)
  sys.exit(3)
val fromTime = fromDay.getTime()
val toTime = toDay.getTime()
val events = readEvents(allfiles, srcStr, fromTime, toTime)
val df = hiveContext.createDataFrame(events).coalesce(1)
df.write.parquet(destFolder)
sys.exit(0)
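
On the "can I give my program a schema and let it parse?" question: with Jackson, which the code above already uses via ObjectMapper, one option is to bind each message to a typed class and ignore unknown fields, falling back to readTree only for truly unpredictable shapes. A minimal sketch, assuming the jackson-module-kotlin dependency is on the classpath (the class and field names here are hypothetical, chosen to mirror a few fields from the code above):

import com.fasterxml.jackson.core.JsonProcessingException
import com.fasterxml.jackson.databind.DeserializationFeature
import com.fasterxml.jackson.module.kotlin.jacksonObjectMapper
import com.fasterxml.jackson.module.kotlin.readValue

// Hypothetical typed view of one log message; unknown fields are ignored,
// so several JSON variants can bind to the same class.
data class LogEvent(
    val action_path: String? = null,
    val s_code: String? = null,
    val rbody: String? = null
)

val mapper = jacksonObjectMapper()
    .disable(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES)

fun parseEvent(json: String): LogEvent? =
    try {
        mapper.readValue<LogEvent>(json) // the data class acts as the schema
    } catch (e: JsonProcessingException) {
        null // not a well-formed message of this shape
    }

Within Spark itself, the analogous move is passing an explicit StructType to spark.read.schema(...).json(...), which skips schema inference and leaves missing fields as nulls.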

Sort an object by a value inside another object

I have a sorting complication in my application.
I have a data object (called pCList) like the following:
Object[0]:
    Id: 1
    comp: Test
    med: xyz
    condition: valueObject.Condition
Object[1]:
    Id: 2
    comp: Test1
    med: pqr
    condition: valueObject.Condition
Object[2]:
    Id: 3
    comp: Test
    med: abc
    condition: valueObject.Condition
The condition value object has data like:
condition data1:
    conId: 001
    cond: abcds
condition data2:
    conId: 001
    cond: trdfd
condition data3:
    conId: 001
    cond: dsdsds
For normal sorting I do it the following way:
var sort:ISort = new Sort();
var sortField:ISortField = new SortField("med");
sort.fields = [sortField];
if (pCList != null)
{
    pCList.sort = sort;
    pCList.refresh();
}
This sorts pCList by med.
But now I want to sort by condition.cond, so that the item whose cond value is abcds comes first, then dsdsds, then trdfd, and so on.
I have tried it using:
var sort:ISort = new Sort();
var sortField:ISortField = new SortField("condition.cond");
sort.fields = [sortField];
But it did not succeed. Any help is greatly appreciated.
SortField does not resolve nested property paths like "condition.cond", but ISort has a compareFunction property that can be used for custom sorting. See the example below.
var sort:ISort = new Sort();
sort.compareFunction = function(a:Object, b:Object, fields:Array = null):int {
    var conditionA:String = a.condition.cond;
    var conditionB:String = b.condition.cond;
    if (conditionA < conditionB) {
        return -1;
    } else if (conditionA > conditionB) {
        return 1;
    } else {
        return 0;
    }
};
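
As an aside, outside Flex the same nested-property sort is typically a one-liner; for example in Kotlin, with hypothetical data classes mirroring the value objects above:

// Hypothetical stand-ins for the value objects in the question.
data class Condition(val conId: String, val cond: String)
data class PC(val id: Int, val comp: String, val med: String, val condition: Condition)

// Sort by the nested condition.cond field, ascending.
fun sortByCondition(pcList: List<PC>): List<PC> =
    pcList.sortedBy { it.condition.cond }

The compareFunction above does the equivalent job in Flex, where the collection API cannot follow a dotted property path on its own.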

Unsupported overload used for query operator 'Intersect'

I am able to do this query just fine with the test repository, which is in-memory.
When I move to the SQL repository I get this error:
Unsupported overload used for query operator 'Intersect'.
I assume it is because the query sent to SQL is too complicated for LINQ to SQL when it is not dealing with the Model.Model.Talent type. Is there some way around doing a search like this with Intersect?
Thanks.
public class TalentService : ITalentService
{
    ITalentRepository _repository = null;

    private IQueryable<Talent> BasicSearch(string searchExpression)
    {
        IQueryable<Talent> t;
        string[] sa = searchExpression.Trim().ToLower().Replace("  ", " ").Split(' ');
        t = _repository.GetTalents();
        foreach (string s in sa)
        {
            t = t.Intersect(AddBasicSearch(s), new TalentComparer());
        }
        return t;
    }

    private IQueryable<Talent> AddBasicSearch(string s)
    {
        IQueryable<Talent> t2 = _repository.GetTalents()
            .Where(tal => tal.EyeColor.ToString().ToLower().Contains(s)
                || tal.FirstName.ToLower().Contains(s)
                || tal.LastName.ToLower().Contains(s)
                || tal.LanguagesString.ToLower().Contains(s)
            );
        return t2;
    }
}
public class SqlTalentRepository : ITalentRepository
{
    public IQueryable<Model.Model.Talent> GetTalents()
    {
        var tal = from t in _db.Talents
                  let tLanguage = GetTalentLanguages(t.TalentID)
                  where t.Active == true
                  select new Model.Model.Talent
                  {
                      Id = t.TalentID,
                      FirstName = t.FirstName,
                      LastName = t.LastName,
                      TalentLanguages = new LazyList<Model.Model.TalentLanguage>(tLanguage),
                      LanguagesString = t.TalentLanguages.ToLanguageNameString(_LanguageRepository.GetLanguages())
                  };
        return tal;
    }

    public IQueryable<Model.Model.TalentLanguage> GetTalentLanguages(int iTalentId)
    {
        var q = from y in this.talentLanguageList
                let Languages = _LanguageRepository.GetLanguages()
                where y.TalentId == iTalentId
                select new Model.Model.TalentLanguage
                {
                    TalentLanguageId = y.TalentLanguageId,
                    TalentId = y.TalentId,
                    LanguageId = y.LanguageId,
                    Language = Languages.Where(x => x.LanguageId == y.LanguageId).SingleOrDefault()
                };
        return q.AsQueryable<Model.Model.TalentLanguage>();
    }
}
public static class TalentExtensions
{
    public static string ToLanguageNameString(this IEnumerable<TalentLanguage> source,
        IEnumerable<Model.Model.Language> allLanguages)
    {
        StringBuilder sb = new StringBuilder();
        const string del = ", ";
        foreach (TalentLanguage te in source)
        {
            sb.AppendFormat("{0}{1}", allLanguages
                .Where(x => x.LanguageId == te.LanguageID).SingleOrDefault().LanguageName, del);
        }
        string sReturn = sb.ToString();
        if (sReturn.EndsWith(del))
            sReturn = sReturn.Substring(0, sReturn.Length - del.Length);
        return sReturn;
    }
}
public class TestTalentRepository : ITalentRepository
{
    IList<Talent> talentList;

    public TestTalentRepository(ILanguageRepository _LanguageRepo)
    {
        this._LanguageRepository = _LanguageRepo;
        talentList = new List<Talent>();
        talentLanguageList = new List<TalentLanguage>();
        for (int i = 0; i < 55; i++)
        {
            var t = new Talent();
            t.Id = i;
            t.FirstName = (i % 3 == 0) ? "Ryan" : "Joe";
            t.LastName = (i % 2 == 0) ? "Simpson" : "Zimmerman";
            AddLanguagesToTestTalent(i, t);
            talentList.Add(t);
        }
    }

    private void AddLanguagesToTestTalent(int i, Talent t)
    {
        IList<Language> Languages = _LanguageRepository.GetLanguages().ToList<Language>();
        Random rLanguages = new Random();
        int numLanguages = rLanguages.Next(Languages.Count - 1) + 1;
        t.TalentLanguages = new LazyList<TalentLanguage>();
        for (int j = 0; j < numLanguages; j++)
        {
            var x = new TalentLanguage();
            x.TalentLanguageId = j;
            x.TalentId = i;
            Random random2 = new Random();
            int rand = random2.Next(Languages.Count);
            var y = Languages.ElementAtOrDefault(rand);
            Languages.RemoveAt(rand);
            x.Language = y;
            x.LanguageId = y.LanguageId;
            t.TalentLanguages.Add(x);
        }
    }

    public IQueryable<Talent> GetTalents()
    {
        var ts = from t in this.talentList
                 let tLanguage = GetTalentLanguages(t.Id)
                 where t.Active == true
                 select new Model.Model.Talent
                 {
                     Id = t.Id,
                     FirstName = t.FirstName,
                     LastName = t.LastName,
                     TalentLanguages = new LazyList<Model.Model.TalentLanguage>(tLanguage),
                     LanguagesString = t.TalentLanguages.ToLanguageNameString(_LanguageRepository.GetLanguages()),
                     City = t.City,
                 };
        return ts.AsQueryable<Model.Model.Talent>();
    }

    public IQueryable<Model.Model.TalentLanguage> GetTalentLanguages(int iTalentId)
    {
        var q = from y in this.talentLanguageList
                let Languages = _LanguageRepository.GetLanguages()
                where y.TalentId == iTalentId
                select new Model.Model.TalentLanguage
                {
                    TalentLanguageId = y.TalentLanguageId,
                    TalentId = y.TalentId,
                    LanguageId = y.LanguageId,
                    Language = Languages.Where(x => x.LanguageId == y.LanguageId).SingleOrDefault()
                };
        return q.AsQueryable<Model.Model.TalentLanguage>();
    }
}
If you're trying to find entries which match all of those criteria, you just need multiple where clauses:
private static readonly char[] SplitDelimiters = " ".ToCharArray();

private IQueryable<Talent> BasicSearch(string search)
{
    // Just replacing "  " with " " wouldn't help with "a   b"
    string[] terms = search.Trim()
                           .ToLower()
                           .Split(SplitDelimiters,
                                  StringSplitOptions.RemoveEmptyEntries);
    IQueryable<Talent> query = _repository.GetTalents();
    foreach (string searchTerm in terms)
    {
        query = AddBasicSearch(query, searchTerm);
    }
    return query;
}

private IQueryable<Talent> AddBasicSearch(IQueryable<Talent> query, string s)
{
    return query.Where(tal =>
        tal.EyeColor.ToString().ToLower().Contains(s)
        || tal.FirstName.ToLower().Contains(s)
        || tal.LastName.ToLower().Contains(s)
        || tal.LanguagesString.ToLower().Contains(s)
    );
}