CSV into MySQL using load data - mysql
I'm hoping to insert the contents of a CSV file into my table using MySQL's LOAD DATA INFILE; however, every time I do so with the following command, a number of rows are dropped:
-- Bulk-load new.csv into the Example table: comma-separated fields enclosed
-- in double quotes, one record per '\n'-terminated line, first line skipped.
LOAD DATA INFILE 'new.csv'
    INTO TABLE Example
    FIELDS
        TERMINATED BY ','
        ENCLOSED BY '"'
    LINES
        TERMINATED BY '\n'
    IGNORE 1 LINES;
The CSV data I am hoping to insert is the Free Company Product Data supplied by Companies House (http://download.companieshouse.gov.uk/en_output.html)
I'd greatly appreciate any help.
Thanks!
Given this table definition, from the provided data specification here:
-- Staging table for the Companies House CSV: one column per CSV field,
-- declared in the same order in which the LOAD DATA field list reads them.
CREATE TABLE companieshouse
(
    -- Company identity
    CompanyName                       VARCHAR(160),
    CompanyNumber                     VARCHAR(8),

    -- RegAddress* fields
    RegAddressCareOf                  VARCHAR(100),
    RegAddressPOBox                   VARCHAR(10),
    RegAddressAddressLine1            VARCHAR(300),
    RegAddressAddressLine2            VARCHAR(300),
    RegAddressPostTown                VARCHAR(50),
    RegAddressCounty                  VARCHAR(60),
    RegAddressCountry                 VARCHAR(50),
    RegAddressPostCode                VARCHAR(20),

    -- Category, status and key dates
    CompanyCategory                   VARCHAR(100),
    CompanyStatus                     VARCHAR(70),
    CountryOfOrigin                   VARCHAR(50),
    DissolutionDate                   DATE,
    IncorporationDate                 DATE,

    -- Accounts* fields
    AccountsAccountRefDay             INT,
    AccountsAccountRefMonth           INT,
    AccountsNextDueDate               DATE,
    AccountsLastMadeUpDate            DATE,
    AccountsAccountCategory           VARCHAR(30),

    -- Returns* fields
    ReturnsNextDueDate                DATE,
    ReturnsLastMadeUpDate             DATE,

    -- Mortgages* counters
    MortgagesNumMortCharges           INT,
    MortgagesNumMortOutstanding       INT,
    MortgagesNumMortPartSatisfied     INT,
    MortgagesNumMortSatisfied         INT,

    -- SICCode* text fields
    SICCodeSicText_1                  VARCHAR(170),
    SICCodeSicText_2                  VARCHAR(170),
    SICCodeSicText_3                  VARCHAR(170),
    SICCodeSicText_4                  VARCHAR(170),

    -- LimitedPartnerships* counters
    LimitedPartnershipsNumGenPartners INT,
    LimitedPartnershipsNumLimPartners INT,

    URI                               VARCHAR(47),

    -- PreviousName<N>: ten (CONDATE, CompanyName) pairs
    PreviousName1CONDATE              DATE,
    PreviousName1CompanyName          VARCHAR(160),
    PreviousName2CONDATE              DATE,
    PreviousName2CompanyName          VARCHAR(160),
    PreviousName3CONDATE              DATE,
    PreviousName3CompanyName          VARCHAR(160),
    PreviousName4CONDATE              DATE,
    PreviousName4CompanyName          VARCHAR(160),
    PreviousName5CONDATE              DATE,
    PreviousName5CompanyName          VARCHAR(160),
    PreviousName6CONDATE              DATE,
    PreviousName6CompanyName          VARCHAR(160),
    PreviousName7CONDATE              DATE,
    PreviousName7CompanyName          VARCHAR(160),
    PreviousName8CONDATE              DATE,
    PreviousName8CompanyName          VARCHAR(160),
    PreviousName9CONDATE              DATE,
    PreviousName9CompanyName          VARCHAR(160),
    PreviousName10CONDATE             DATE,
    PreviousName10CompanyName         VARCHAR(160),

    -- ConfStmt* dates
    ConfStmtNextDueDate               DATE,
    ConfStmtLastMadeUpDate            DATE
);
This will load data from the provided .csv files into the table:
-- Load one Companies House CSV part file into the companieshouse table.
--
-- Fields that feed VARCHAR columns are loaded straight into the column.
-- Fields that feed DATE or INT columns are first captured in a user variable
-- (@name) and converted in the SET clause: an empty string becomes NULL and
-- date text is parsed as dd/mm/yyyy.
--
-- User variables must carry the '@' sigil. In MySQL a '#' starts a comment
-- that runs to the end of the line, so writing '#name' drops that entry from
-- the field list (shifting every later field into the wrong column) and
-- truncates the SET expressions.
--
-- NOTE(review): @ConfStmtNextDueDate and @ConfStmtLastMadeUpDate are read from
-- the file but never assigned in SET, so those two columns are left NULL by
-- this statement. Confirm whether that is intended before relying on them.
LOAD DATA INFILE '/var/lib/mysql-files/BasicCompanyData-2017-03-06-part1_5.csv'
INTO TABLE companieshouse
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
ESCAPED BY ''  -- single-quoted so the empty string is still a string literal under ANSI_QUOTES
LINES TERMINATED BY '\n'
IGNORE 1 LINES
(
    CompanyName,
    CompanyNumber,
    RegAddressCareOf,
    RegAddressPOBox,
    RegAddressAddressLine1,
    RegAddressAddressLine2,
    RegAddressPostTown,
    RegAddressCounty,
    RegAddressCountry,
    RegAddressPostCode,
    CompanyCategory,
    CompanyStatus,
    CountryOfOrigin,
    @DissolutionDate,
    @IncorporationDate,
    @AccountsAccountRefDay,
    @AccountsAccountRefMonth,
    @AccountsNextDueDate,
    @AccountsLastMadeUpDate,
    AccountsAccountCategory,
    @ReturnsNextDueDate,
    @ReturnsLastMadeUpDate,
    @MortgagesNumMortCharges,
    @MortgagesNumMortOutstanding,
    @MortgagesNumMortPartSatisfied,
    @MortgagesNumMortSatisfied,
    SICCodeSicText_1,
    SICCodeSicText_2,
    SICCodeSicText_3,
    SICCodeSicText_4,
    @LimitedPartnershipsNumGenPartners,
    @LimitedPartnershipsNumLimPartners,
    URI,
    @PreviousName1CONDATE,
    PreviousName1CompanyName,
    @PreviousName2CONDATE,
    PreviousName2CompanyName,
    @PreviousName3CONDATE,
    PreviousName3CompanyName,
    @PreviousName4CONDATE,
    PreviousName4CompanyName,
    @PreviousName5CONDATE,
    PreviousName5CompanyName,
    @PreviousName6CONDATE,
    PreviousName6CompanyName,
    @PreviousName7CONDATE,
    PreviousName7CompanyName,
    @PreviousName8CONDATE,
    PreviousName8CompanyName,
    @PreviousName9CONDATE,
    PreviousName9CompanyName,
    @PreviousName10CONDATE,
    PreviousName10CompanyName,
    @ConfStmtNextDueDate,
    @ConfStmtLastMadeUpDate
)
SET
    -- Date columns: empty string -> NULL, otherwise parse dd/mm/yyyy.
    DissolutionDate = IF(@DissolutionDate = '', NULL, STR_TO_DATE(@DissolutionDate, '%d/%m/%Y')),
    IncorporationDate = IF(@IncorporationDate = '', NULL, STR_TO_DATE(@IncorporationDate, '%d/%m/%Y')),
    AccountsNextDueDate = IF(@AccountsNextDueDate = '', NULL, STR_TO_DATE(@AccountsNextDueDate, '%d/%m/%Y')),
    AccountsLastMadeUpDate = IF(@AccountsLastMadeUpDate = '', NULL, STR_TO_DATE(@AccountsLastMadeUpDate, '%d/%m/%Y')),
    ReturnsNextDueDate = IF(@ReturnsNextDueDate = '', NULL, STR_TO_DATE(@ReturnsNextDueDate, '%d/%m/%Y')),
    ReturnsLastMadeUpDate = IF(@ReturnsLastMadeUpDate = '', NULL, STR_TO_DATE(@ReturnsLastMadeUpDate, '%d/%m/%Y')),
    PreviousName1CONDATE = IF(@PreviousName1CONDATE = '', NULL, STR_TO_DATE(@PreviousName1CONDATE, '%d/%m/%Y')),
    PreviousName2CONDATE = IF(@PreviousName2CONDATE = '', NULL, STR_TO_DATE(@PreviousName2CONDATE, '%d/%m/%Y')),
    PreviousName3CONDATE = IF(@PreviousName3CONDATE = '', NULL, STR_TO_DATE(@PreviousName3CONDATE, '%d/%m/%Y')),
    PreviousName4CONDATE = IF(@PreviousName4CONDATE = '', NULL, STR_TO_DATE(@PreviousName4CONDATE, '%d/%m/%Y')),
    PreviousName5CONDATE = IF(@PreviousName5CONDATE = '', NULL, STR_TO_DATE(@PreviousName5CONDATE, '%d/%m/%Y')),
    PreviousName6CONDATE = IF(@PreviousName6CONDATE = '', NULL, STR_TO_DATE(@PreviousName6CONDATE, '%d/%m/%Y')),
    PreviousName7CONDATE = IF(@PreviousName7CONDATE = '', NULL, STR_TO_DATE(@PreviousName7CONDATE, '%d/%m/%Y')),
    PreviousName8CONDATE = IF(@PreviousName8CONDATE = '', NULL, STR_TO_DATE(@PreviousName8CONDATE, '%d/%m/%Y')),
    PreviousName9CONDATE = IF(@PreviousName9CONDATE = '', NULL, STR_TO_DATE(@PreviousName9CONDATE, '%d/%m/%Y')),
    PreviousName10CONDATE = IF(@PreviousName10CONDATE = '', NULL, STR_TO_DATE(@PreviousName10CONDATE, '%d/%m/%Y')),
    -- Integer columns: empty string -> NULL, otherwise the value as read.
    AccountsAccountRefDay = NULLIF(@AccountsAccountRefDay, ''),
    AccountsAccountRefMonth = NULLIF(@AccountsAccountRefMonth, ''),
    MortgagesNumMortCharges = NULLIF(@MortgagesNumMortCharges, ''),
    MortgagesNumMortOutstanding = NULLIF(@MortgagesNumMortOutstanding, ''),
    MortgagesNumMortPartSatisfied = NULLIF(@MortgagesNumMortPartSatisfied, ''),
    MortgagesNumMortSatisfied = NULLIF(@MortgagesNumMortSatisfied, ''),
    LimitedPartnershipsNumGenPartners = NULLIF(@LimitedPartnershipsNumGenPartners, ''),
    LimitedPartnershipsNumLimPartners = NULLIF(@LimitedPartnershipsNumLimPartners, '')
;
... loaded the data without any errors or warnings:
Query OK, 849999 rows affected (19.43 sec)
Records: 849999 Deleted: 0 Skipped: 0 Warnings: 0
On the one hand, LOAD DATA INFILE is the fastest method, but real-life imports always require some checks, transformations, and other business logic.
The normal practice is therefore to use ETL tools rather than a direct import,
or a multi-stage process: first clean the data (check it, log errors, transform it, add calculated columns, etc.), then import the final result.
Now there are many excellent OpenSource tools for this:
Talend - http://www.talend.com
StreamSets - http://www.streamsets.com
Apache NiFi - https://nifi.apache.org, https://hortonworks.com/apache/nifi/
By moving your import/export logic into one of these tools, you can implement all of the steps, such as:
download files
parse them
make any changes and lookups
and afterwards load directly from the tool, or call a bulk load if you prefer
A good example is the recommendations in the comments on your other question - https://dba.stackexchange.com/questions/168194/mysql-select-and-sort-performance-v-large-table - which show how to easily add a date column from wrongly formatted files
Related
Mysql csv file selecting multiple value in one column
,PMID,LastName,ForeName,Initials,Affiliation 0,1,"['Makar', 'McMartin', 'Palese', 'Tephly']","['A B', 'K E', 'M', 'T R']","['AB', 'KE', 'M', 'TR']", ,,,,, enter image description here this is the csv file I want to divide this to look like this PMID Name Affiliation 1 Makar_A B_AB 1 MCMartin_K E_KE 1 Palese_M_M 1 Tephly_T R_TR below is the code I wrote in mysql workbench DROP TABLE IF EXISTS AuthorTBL; CREATE TABLE IF NOT EXISTS AuthorTBL ( PMID varchar(100), LastName varchar(50), ForeName varchar(50), Initials varchar(50), Affiliation varchar(250) ); LOAD DATA INFILE 'abcd.csv' INTO TABLE AuthorTBL FIELDS TERMINATED BY ',' ENCLOSED BY '"' LINES TERMINATED BY '\r\n' IGNORE 1 ROWS (@num, @PMID, @LastName, @ForeName, @Initials, @Affiliation) SET PMID = @PMID, LastName = @LastName, ForeName = @ForeName, Initials = @Initials, Affiliation = @Affiliation; How can I change my code to make it work?
MySQL get max(PRIMARY KEY) for each compound group
I've got the use case to version objects (identified by objectOwnerId and objectId group). I insert rows to ledger table with their respective hashes. The order of the ledger table is identified by the compound PRIMARY KEY and its timestamp up to microsecond precision + additional 3 byte entropy at the end to prevent collisions (in case multiple rows gets inserted at the same microsecond). Once data is stored I need efficient way to get the latest hash for multiple objects at once. I've came up with a query (please see end of this post) which is built from sub-selects with JOIN and GROUP BY, but it's pretty complex I think and I am looking for ways to address my problem in a simpler (if possible) way. Is there any way for improvement? It would've been simpler if I have PRIMARY KEY which isn't COMPOUND, in which case I could pass the max() value upwards, however that's not the case. I was also thinking if I could merge my TIMESTAMP(6) - 7 bytes with BINARY(3) - 3 bytes and store it as BINARY(10), but wasn't sure if that's easily possible. Please find the schema, test data and SELECT queries below. 
This is my table: CREATE TABLE `ledger` ( `objectOwnerId` CHAR(10) NOT NULL, `objectId` VARCHAR(50) NOT NULL, `objectHash` BINARY(16) NOT NULL, `timestamp` TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6), `timestampAdditionalEntropy` BINARY(3) NOT NULL, PRIMARY KEY (`timestamp`, `timestampAdditionalEntropy`), UNIQUE(`objectHash`), INDEX(`objectId`(10)) ); Let's insert some values: INSERT INTO ledger (objectOwnerId, objectId, objectHash, timestampAdditionalEntropy) VALUES ('owneraaaaa', 'ida', unhex(substring(sha1(random_bytes(16)), 1, 32)), random_bytes(3)); INSERT INTO ledger (objectOwnerId, objectId, objectHash, timestampAdditionalEntropy) VALUES ('owneraaaaa', 'ida', unhex(substring(sha1(random_bytes(16)), 1, 32)), random_bytes(3)); INSERT INTO ledger (objectOwnerId, objectId, objectHash, timestampAdditionalEntropy) VALUES ('owneraaaab', 'idb', unhex(substring(sha1(random_bytes(16)), 1, 32)), random_bytes(3)); INSERT INTO ledger (objectOwnerId, objectId, objectHash, timestampAdditionalEntropy) VALUES ('owneraaaab', 'idb', unhex(substring(sha1(random_bytes(16)), 1, 32)), random_bytes(3)); INSERT INTO ledger (objectOwnerId, objectId, objectHash, timestampAdditionalEntropy) VALUES ('owneraaaab', 'idb', unhex(substring(sha1(random_bytes(16)), 1, 32)), random_bytes(3)); We've got this dataset: # objectOwnerId, objectId, objectHash, timestamp, HEX(CAST(timestampAdditionalEntropy AS CHAR(6) CHARACTER SET utf8)) #'owneraaaab', 'idb', 'A8D3B63EFC6C63FD996B8D1931FBF748', '2019-05-29 11:38:12.353521', '725E3D' #'owneraaaab', 'idb', '9B7395F9EE2F2363BA89C7FBAEDDBB54', '2019-05-29 11:38:12.352524', '8B8162' #'owneraaaab', 'idb', '80393C5FF4492342D073B5F8B3388EC2', '2019-05-29 11:38:12.351569', 'FEAA02' #'owneraaaaa', 'ida', '0D84F725ACAC87838C34742CA00BBEF7', '2019-05-29 11:38:12.350648', '41E425' #'owneraaaaa', 'ida', '9A82C936A25C4648BFB75B692850841B', '2019-05-29 11:38:12.349625', '470685' returned by this query: select objectOwnerId, objectId, HEX(CAST(objectHash AS 
CHAR(32) CHARACTER SET utf8)) as objectHash, timestamp, HEX(CAST(timestampAdditionalEntropy AS CHAR(6) CHARACTER SET utf8)) from ledger order by timestamp desc, timestampAdditionalEntropy desc; I need to get this: # objectOwnerId, objectId, objectHash, timestamp, HEX(CAST(s.timestampAdditionalEntropy AS CHAR(6) CHARACTER SET utf8)) #owneraaaaa, ida, 0D84F725ACAC87838C34742CA00BBEF7, 2019-05-29 11:38:12.350648, 41E425 #owneraaaab, idb, A8D3B63EFC6C63FD996B8D1931FBF748, 2019-05-29 11:38:12.353521, 725E3D which this query can return: select s.objectOwnerId, s.objectId, HEX(CAST(objectHash AS CHAR(32) CHARACTER SET utf8)) as objectHash, s.timestamp, HEX(CAST(s.timestampAdditionalEntropy AS CHAR(6) CHARACTER SET utf8)) from ( select s.objectOwnerId, s.objectId, s.timestamp, max(i.timestampAdditionalEntropy) as timestampAdditionalEntropy from ( select objectOwnerId, objectId, max(timestamp) as timestamp from ledger where ((objectOwnerId = 'owneraaaaa' AND objectId = 'ida') OR (objectOwnerId = 'owneraaaab' AND objectId = 'idb')) group by objectOwnerId, objectId ) s JOIN ledger i on i.objectOwnerId = s.objectOwnerId and i.objectId = s.objectId and i.timestamp = s.timestamp group by objectOwnerId, objectId, timestamp ) s JOIN ledger i on i.objectOwnerId = s.objectOwnerId and i.objectId = s.objectId and i.timestamp = s.timestamp and i.timestampAdditionalEntropy = s.timestampAdditionalEntropy
How to get a String from not null values of the columns row wise in MySQL
I have a table in MySQL like below, Now I want to get an output like below, id |username |accessLevel ============================ 6 |deepaku8 |arch[ALL] 7 |kirajama |arch[ALL]- geo[APJ] 8 |asau |arch[Data Center]- geo[EMEAR_REGION] Is there any way to achieve this result by SQL query ?
This does the basics for you table (guessing the columns you want) SELECT id, username, CONCAT_WS(' - ', IF(arch IS NULL OR arch = '', NULL, CONCAT('arch', '[', arch, ']')), IF(geo IS NULL OR geo = '', NULL, CONCAT('geo', '[', geo, ']')), IF(theater IS NULL OR theater = '', NULL, CONCAT('theater', '[', theater, ']')), IF(operation IS NULL OR operation = '', NULL, CONCAT('operation', '[', operation, ']')), IF(region IS NULL OR region = '', NULL, CONCAT('region', '[', region, ']')), IF(country IS NULL OR country = '', NULL, CONCAT('country', '[', country, ']')), IF(technology IS NULL OR technology = '', NULL, CONCAT('technology', '[', technology, ']')), IF(subTechnology IS NULL OR subTechnology = '', NULL, CONCAT('subTechnology', '[', subTechnology, ']')) ) AS accessLevel FROM some_table
You query in SQL using the SELECT statement. SQL commands are implemented as SQL command strings. Here is a good tutorial by Microsoft.
You can use concat_ws to do this. concat returns null if any of the values is null and concat_ws ignores null values during concatenation. select id,username, concat_ws('-',concat('arch','[',if(arch='',null,arch),']'), concat('geo','[',if(geo='',null,geo),']'), concat('technology','[',if(technology='',null,technology),']') --add more values as required ) from tablename
HeidiSQL import csv empty fields
I want to import a csv file to my MySQL database with HeidiSQL. But some of my fields are empty. What could I do to let HeidiSQL know these empty values have to be seen as NULL-values? Sample of csv-file (last 2 fields not yet known): NULL;Students Corner;437452182;; Create commands: CREATE TABLE `db`.`customers` ( `company_id` INT NOT NULL AUTO_INCREMENT , `company_name` VARCHAR(40) NULL , `company_number` INT NULL , `company_vat` INT NULL , `company_zip` INT NULL, PRIMARY KEY (`company_id`) ); I get these error: Incorrect integer value: '' for column 'company_id' at row 1 */ Incorrect integer value: '' for column 'company_vat' at row 1 */ Incorrect integer value: '' for column 'company_zip' at row 1 */ etc
I solved it by writing \N in each empty field instead of writing NULL!
You can import CSV files into MySQL using a LOAD DATA INFILE query. In your case, you would write something like this: LOAD DATA INFILE filename.txt INTO TABLE customers FIELDS TERMINATED BY ';' LINES TERMINATED BY '\n' (@id, @name, @number, @vat, @zip) SET company_id = (CASE WHEN @id='' THEN NULL ELSE @id END), company_name = (CASE WHEN @name='' THEN NULL ELSE @name END), company_number = (CASE WHEN @number='' THEN NULL ELSE @number END), company_vat = (CASE WHEN @vat='' THEN NULL ELSE @vat END), company_zip = (CASE WHEN @zip='' THEN NULL ELSE @zip END) (you may need to adjust this, depending on your end of line markers, etc, but this should be pretty close to what you need)
find closest lat long to a input lat long Sql server 2008
Hi I have a point cloud in my database (Sql server 2008 spatial). That is about 6 million records. There are 3 columns: id, value , geom. What is the most optimized way of getting the 'value' at input lat long ?? I am new to spatial queries in SQL Server 2008. Can some one post simple example of finding the point in geom column, matching or closest from the input lat long? Thanks Shaunak
Assuming that you have a table Wifi with columns: id, placeName, locationCoord (geography): CREATE TABLE [dbo].[WiFi]( [id] [int] IDENTITY(1,1) NOT NULL, [placeName] [varchar](500) NULL, [locationCoord] [geography] NULL, CONSTRAINT [PK_WiFi] PRIMARY KEY CLUSTERED ([id] ASC)) Here the locationCoord is a geography type. Lets say the input is a latitude and longitude as varchar datatypes. You can get the nearest points/locations by using something like: declare @longitude varchar(50) = '-77.26939916610718', @latitude varchar(50) = '39.168516439779914' declare @ms_at geography, @locationString nvarchar(1000) set @locationString = 'SELECT @ms_at = geography::STGeomFromText(''POINT(' + @longitude + ' ' + @latitude + ')'', 4326)' exec sp_executesql @locationString, N'@ms_at geography OUT', @ms_at OUT select nearPoints.placeName, nearPoints.locationCoord.STDistance(@ms_at) as distance ,RANK() OVER (ORDER BY nearPoints.locationCoord.STDistance(@ms_at)) AS 'Rank' from ( select r.id, r.placeName, r.locationCoord from WiFi r where r.locationCoord.STIntersects(@ms_at.STBuffer(10000)) = 1 ) nearPoints