Load XML file to MySQL database using Python with column mapping

I have written code to load data into a MySQL table using the pymysql library. I'm loading the data into the MySQL table in the following manner:
import pymysql

con = pymysql.connect(host=host, user=user, password=passwd, db=db, port=int(port),
                      autocommit=True, local_infile=1)
cursor = con.cursor()
sql = ("LOAD XML INFILE '" + path + "' INTO TABLE " + ds_name + "." + table_name +
       " SET dataset=" + ds_name + ", factor_date=" + factor_date + "," + column_map +
       " ROWS IDENTIFIED BY '<LoanInfo>'")
cursor.execute(sql)
con.commit()
ds_name and factor_date are not coming from the XML file, so I'm writing them as static values across all the rows.
I have a CSV/Excel file that contains the mapping between XML file columns and MySQL table column names for 100+ columns. I read somewhere that a reference column mapping can be added to the SQL query as 'SET ABC_AGE = #Age,UNIQUE_ID= #ID, BALANCE=#Money'. I am creating a list of mappings in the following manner:
ls = []
for value in zip(map_df['XML Columns'], map_df['SQL Columns']):
    # SET clause expects <table column> = #<XML column>
    ls.append(value[1] + "=#" + value[0])
column_map = ",".join(ls)
My question is: is there a better way to load an XML file into MySQL using Python with column mapping?

I found a way to transform the XML file into a pandas DataFrame and then load it with executemany into the MySQL database. Here is a piece of code to convert the XML to a DataFrame:
# Reading the mapping file and converting the mapping to a dictionary
import os
import xml.etree.ElementTree as ET

import pandas as pd

map_path = 'Mapping.xlsx'
if os.path.isfile(map_path):
    map_df = pd.read_excel(map_path, sheet_name='Mapping')
    mapping_dict = pd.Series(map_df['XML Columns'].values, index=map_df['SQL columns']).to_dict()

# Reading the XML file
xml_path = 'test.xml'
if os.path.isfile(xml_path):
    root = ET.parse(xml_path).getroot()

# Reading XML elements one by one and storing their attributes in a dictionary
columns_to_skip = []  # placeholder: SQL column names that should not be read from the XML
missing_col = set()
xmldf_dict = {"df_dicts": []}
for elem in root:
    df_dict = {}
    for k, v in mapping_dict.items():
        if k in columns_to_skip:
            continue
        try:
            df_dict[k] = elem.attrib[v]
        except KeyError:
            missing_col.add(k)
    xmldf_dict["df_dicts"].append(df_dict)

# Merging the missing-columns dataframe with the XML dataframe
missing_col_df = pd.DataFrame(columns=list(missing_col))
xml_df = pd.DataFrame(xmldf_dict["df_dicts"])
final_df = pd.concat([xml_df, missing_col_df], axis=1)
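To complete the executemany load mentioned above, here is a minimal sketch. It assumes con is the pymysql connection from the first snippet, that final_df's column names match the MySQL table's column names, and that the table has dataset and factor_date columns as in the original SET clause:
# Add the static columns that do not come from the XML file
final_df['dataset'] = ds_name
final_df['factor_date'] = factor_date

cols = ", ".join("`" + c + "`" for c in final_df.columns)
placeholders = ", ".join(["%s"] * len(final_df.columns))
insert_sql = ("INSERT INTO " + ds_name + "." + table_name +
              " (" + cols + ") VALUES (" + placeholders + ")")

# Convert NaN to None so pymysql writes SQL NULLs for missing attributes
rows = [tuple(None if pd.isna(v) else v for v in row)
        for row in final_df.itertuples(index=False, name=None)]

with con.cursor() as cursor:
    cursor.executemany(insert_sql, rows)
con.commit()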

Related

How do I load given CSV data file from a given path? (Python)

My professor has instructed me to
Load 3d_classification_data_v0.csv data file, from path: '../mlrefined_datasets/superlearn_datasets/'
use csv or pandas package for reading csv file.
import csv

file = open("csvfile.csv")
csvreader = csv.reader(file)
header = next(csvreader)
rows = []
for row in csvreader:
    rows.append(row)
file.close()
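Since the assignment allows pandas, here is a minimal sketch using the path and file name stated above; whether the file has a header row is an assumption, so header=None may need to be dropped:
import pandas as pd

# Path and file name as given in the assignment
data_path = '../mlrefined_datasets/superlearn_datasets/3d_classification_data_v0.csv'
# header=None is an assumption; remove it if the first line holds column names
data = pd.read_csv(data_path, header=None)
print(data.head())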

dataframe results are not returned while reading csv file

I'm trying to read a CSV file; below is the code I used, but it's not returning any results. The CSV file at the specified path does have data in it. I had an issue when I used ValidFile = spark.read.csv(ValidationFileDest, header=True): that call did return a result, but the data in the columns was interchanged and nulls were assigned, which is why I applied the DROPMALFORMED mode in my code. Now, however, it returns no result at all.
parquetextension = ".parquet"
BronzeStage_Path = "dbfs:/mnt/bronze/stage/" + parentname + "/" + filename
#validated_path="dbfs:/mnt/bronze/landing/ClaimDenialsSouce/"+parentname+"/"+"current/"+"Valid/"+todayDate+"_"+"CDAValidFile"+extension
# df_sourcefilevalid.repartition(1).write.format(write_format).option("header", "true").save(BronzeStagePath)
# ValidFileSrc_BS= get_csv_files(exception_path)
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master("local") \
    .appName("parquet_example") \
    .getOrCreate()
spark.conf.set("spark.sql.csv.parser.columnPruning.enabled", False)
ValidFile = spark.read.format('csv').option("mode", "DROPMALFORMED").options(header='true', inferSchema='true').load(ValidationFileDest)
display(ValidFile)
Make sure to check that you are providing the correct file path or variable for your CSV file. I have reproduced this in our environment and was able to read the CSV file without any issue.
Reading the CSV file:
filepath="dbfs:/FileStore/test11-1.csv"
df11 = spark.read.format("csv").option("mode", "DROPMALFORMED").option("header", "true").load(filepath)
display(df11)

In Palantir Foundry, can I find which CSV file is causing schema errors in a dataset?

I'm seeing errors like the following when building downstream of some datasets containing CSV files:
Caused by: java.lang.IllegalStateException: Header specifies 185 column types but line split into 174: "SUSPECT STRING","123...
or
Caused by: java.lang.RuntimeException: Error while encoding: java.lang.RuntimeException: Exception parsing 'SUSPECT STRING' into a IntegerType$ for column "COLOUR_ID": Unable to deserialize value using com.palantir.spark.parsers.text.converters.IntegerConverter. The value being deserialized was: SUSPECT STRING
Looking at the errors it seems to me like some of my CSV files have the wrong schema. How can I find which ones?
One technique you could use would be to:
create a transform that reads the CSV files in as if they were unstructured text files, then
filter the resulting DataFrame down to just the suspect rows, as identified by the extracts contained in the error message
Below is an example of such a transform:
from pyspark.sql import functions as F
from transforms.api import transform, Input, Output
from transforms.verbs.dataframes import union_many


def read_files(spark_session, paths):
    parsed_dfs = []
    for file_name in paths:
        parsed_df = (
            spark_session.read.text(file_name)
            .filter(F.col("value").contains(F.lit("SUSPECT STRING")))
            .withColumn("_filename", F.lit(file_name))
        )
        parsed_dfs += [parsed_df]
    output_df = union_many(*parsed_dfs, how="wide")
    return output_df


@transform(
    output_dataset=Output("my_output"),
    input_dataset=Input("my_input"),
)
def compute(ctx, input_dataset, output_dataset):
    session = ctx.spark_session
    input_filesystem = input_dataset.filesystem()
    hadoop_path = input_filesystem.hadoop_path
    files = [hadoop_path + "/" + file_name.path for file_name in input_filesystem.ls()]
    output_df = read_files(session, files)
    output_dataset.write_dataframe(output_df)
This would then output the rows of interest along with the paths to the files they're in.

Reading Json files using pyspark

I am trying to read multiple json files from dbfs in databricks.
raw_df = spark.read.json('/mnt/testdatabricks/metrics-raw/',recursiveFileLookup=True)
This returns data for only 35 files whereas there are around 1600 files.
I tried to read some of the files (except those 35) using pandas and it returned data.
However the driver fails when I try to read all 1600 files using pandas.
import pandas as pd
from glob import glob

jsonFiles = glob('/dbfs/mnt/testdatabricks/metrics-raw/***/*.json')
dfList = []
for jsonFile in jsonFiles:
    df = pd.read_json(jsonFile)
    dfList.append(df)
    print("written :", jsonFile)
dfTrainingDF = pd.concat(dfList, axis=0)
Not sure why spark is not able to read all the files.
Try:
spark.read.option("recursiveFileLookup", "true").json("file:///dir1/subdirectory")
Ref: How to make Spark session read all the files recursively?
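Applied to the directory from the question, that would look roughly like the sketch below (the mount path is taken from the question; whether Spark has access to it in your workspace is assumed):
# Recursive lookup applied to the directory from the question
raw_df = (
    spark.read
    .option("recursiveFileLookup", "true")
    .json("/mnt/testdatabricks/metrics-raw/")
)
print(raw_df.count())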

How to convert json file into table structure in redshift using python

How can I convert a JSON file into a table structure in Redshift? I tried the Python code below.
import boto3
import json
import os
import sys
import psycopg2
import csv
from collections import defaultdict

def jsonfile(path):
    session = boto3.Session(
        aws_access_key_id='dfjfkgj',
        aws_secret_access_key='sdfg',
        region_name='us-west-2')
    s3 = session.resource('s3')
    bucket = s3.Bucket('ag-redshift-poc')
    with open(path, 'rb') as data:
        res = json.load(data)
    f = open('data.csv', 'wb')
    output = csv.writer(f)
    output.writerow(res[0].keys())
    for row in res:
        output.writerow(row.values())
    bucket.put_object(Key=('C:\Python27\data.csv'), Body=res)
    print 'success'

def redshift():
    co = psycopg2.connect(dbname='redshiftpoc', host='shdjf',
                          port='5439', user='admin', password='snd')
    curr = co.cursor()
    curr.execute("""copy sample from 's3://ag-redshift-poc/testfile/json.txt'
        CREDENTIALS 'aws_access_key_id=fdfd;aws_secret_access_key=sxhd'
        """)
    co.commit()
    print 'success'
    curr.close()
    co.close()

jsonfile('C:\Python27\json.txt')
redshift()
Redshift can directly ingest JSON via COPY into your table (though it is not very efficient). In your case, modify the COPY query to:
COPY sample FROM 's3://<bucket_name>/<path_to_json>'
CREDENTIALS 'aws_access_key_id=xxxx;aws_secret_access_key=xxxx'
JSON 'auto' ACCEPTINVCHARS;
Please note the JSON 'auto' option in the query. It maps every column in the table to the matching key in the JSON file.
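Plugged into the psycopg2 code from the question, the modified COPY could look roughly like this sketch (connection details and credentials are placeholders, not real values):
import psycopg2

# Sketch only: connection parameters and credentials are placeholders
co = psycopg2.connect(dbname='redshiftpoc', host='<redshift-endpoint>',
                      port='5439', user='admin', password='<password>')
curr = co.cursor()
curr.execute("""
    COPY sample FROM 's3://ag-redshift-poc/testfile/json.txt'
    CREDENTIALS 'aws_access_key_id=<key>;aws_secret_access_key=<secret>'
    JSON 'auto' ACCEPTINVCHARS;
""")
co.commit()
curr.close()
co.close()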
More details here in the COPY examples