How to do a two-step lookup using jq? - json

My bank provides a very poor excuse for a transaction export, which makes reconciling receipts for taxes way more difficult than it needs to be. Fortunately, their web site is an SPA that is chock-full of rich JSON-formatted data.
Having extracted all of the transaction data for a year, I'd now like to transform that data into filenames (date - cat - desc - amt.pdf) to use for receipt scans to help speed up the reconciling process.
I'm close to being able to do this, but doing a double lookup has stumped me, and I could use the help of someone more well versed in jq.
Sample transaction data
Here's a sample set of the data (many fields omitted for privacy).
export START_YEAR=2020
cat ${START_YEAR}-transaction.json | jq -c -r \
'{ transaction_date: .transaction_date | capture("(?<x>[0-9,-]{10})") | join("-"), id: .id, category_id: .category_id, description: .description }' \
| tail -4
{"transaction_date":"2020-01-01","id":365707178,"category_id":113,"description":"PAYPAL *JOES DC"}
{"transaction_date":"2020-01-02","id":366592530,"category_id":84,"description":"SANSOTEI RAMEN"}
{"transaction_date":"2020-01-02","id":365963458,"category_id":84,"description":"RURU RESTAURANT"}
{"transaction_date":"2020-01-09","id":367661534,"category_id":211,"description":"PAYMENT - THANK YOU"}
{"transaction_date":"2020-01-12","id":368954730,"category_id":35,"description":"PEARSON PARKING T3"}
{"transaction_date":"2020-03-03","id":383492950,"category_id":84,"description":"WVRST"}
{"transaction_date":"2020-03-18","id":387334378,"category_id":113,"description":"WWW.ALIEXPRESS.COM LONDON GBR"}
{"transaction_date":"2020-03-27","id":389293681,"category_id":91,"description":"PAYPAL *CORGI TECH GBR"}
{"transaction_date":"2020-10-09","id":445515569,"category_id":52,"description":"GOODLIFE FITNESS"}
{"transaction_date":"2020-10-27","id":448912859,"category_id":211,"description":"Credit Card Payment"}
{"transaction_date":"2020-12-18","id":466667441,"category_id":64,"description":"KICKSTARTER: A TECHNOL SHAM SHUI PO KOW"}
{"transaction_date":"2020-12-31","id":469463176,"category_id":4,"description":"Interest Paid"}
Lookup #1: category id to translation tag
First, the category_id is looked up in the following categories data set to extract a category.key, which is a translation tag (i.e. category_id 113 is electronicsComputers).
categories.json (extracted from angular app file - app.dc68cc71460569631f28.js)
{"1":{"id":1,"icon":"icon-income","key":"income","parentCategoryId":null,"color":"#609b02"},"2":{"id":2,"icon":"icon-cheque","key":"payrollorEmploymentIncome","parentCategoryId":1},"4":{"id":4,"icon":"icon-interest","key":"interestCapitalIncome","parentCategoryId":1},"5":{"id":5,"icon":"icon-income-tax-94","key":"taxRefund","parentCategoryId":1},"13":{"id":13,"icon":"icon-home","key":"home","parentCategoryId":null,"color":"#335400"},"14":{"id":14,"icon":"r","key":"groceries","parentCategoryId":116},"16":{"id":16,"icon":"icon-lightbulb","key":"electricityHeatingUtilities","parentCategoryId":13},"17":{"id":17,"icon":"icon-home-insurance","key":"homeInsurancePremiums","parentCategoryId":307},"18":{"id":18,"icon":"icon-tv","key":"tVPhoneInternet","parentCategoryId":13},"20":{"id":20,"icon":"icon-cleaning","key":"homeCleaning","parentCategoryId":13},"21":{"id":21,"icon":"icon-rent-paid","key":"rentPaid","parentCategoryId":13},"23":{"id":23,"icon":"s","key":"furnishingsAppliancesDécor","parentCategoryId":13},"24":{"id":24,"icon":"icon-art","key":"flowersCandlesArtSmallItems","parentCategoryId":64},"27":{"id":27,"icon":"icon-mortgage","key":"mortgage","parentCategoryId":13},"28":{"id":28,"icon":"icon-property-taxes","key":"propertyTaxes","parentCategoryId":13},"29":{"id":29,"icon":"c","key":"homeImprovementsRepairs","parentCategoryId":13},"32":{"id":32,"icon":"icon-car","key":"carsTransportation","parentCategoryId":null,"color":"#0064d8"},"33":{"id":33,"icon":"icon-auto-insurance","key":"autoInsurancePremiums","parentCategoryId":307},"35":{"id":35,"icon":"icon-car","key":"parking","parentCategoryId":32},"36":{"id":36,"icon":"icon-gas","key":"gasFuel","parentCategoryId":32},"37":{"id":37,"icon":"icon-wrench","key":"autoMaintenanceExpenses","parentCategoryId":32},"38":{"id":38,"icon":"icon-auto-loan","key":"autoLoanLeasePayment","parentCategoryId":32},"39":{"id":39,"icon":"icon-bus","key":"taxisPublicTransportation","parentCategoryId":32},"40":{"id":40,"icon":"icon-family","
key":"family","parentCategoryId":null,"color":"#335400"},"41":{"id":41,"icon":"icon-baby","key":"childcare","parentCategoryId":40},"42":{"id":42,"icon":"icon-childrens-clothes","key":"children'sClothing","parentCategoryId":40},"43":{"id":43,"icon":"icon-childrens-toys","key":"children'sToys","parentCategoryId":40},"46":{"id":46,"icon":"icon-paddleball","key":"children'sRecreationalActivities","parentCategoryId":40},"48":{"id":48,"icon":"icon-health-and-beauty","key":"healthBeauty","parentCategoryId":null,"color":"#048476"},"49":{"id":49,"icon":"l","key":"drugStoresandPharmacies","parentCategoryId":48},"50":{"id":50,"icon":"icon-health-insurance","key":"lifeHealthInsurancePremiums","parentCategoryId":307},"51":{"id":51,"icon":"icon-bandaid","key":"doctorsHealthcare","parentCategoryId":48},"52":{"id":52,"icon":"icon-sports","key":"sports","parentCategoryId":70},"54":{"id":54,"icon":"icon-loans-taxes-fines","key":"loansTaxesandFines","parentCategoryId":null,"color":"#4c7b02"},"55":{"id":55,"icon":"icon-fines","key":"fines","parentCategoryId":54},"59":{"id":59,"icon":"icon-bank-fees","key":"bankFeesServiceFees","parentCategoryId":54},"61":{"id":61,"icon":"icon-media","key":"subscriptionsMedia","parentCategoryId":64},"64":{"id":64,"icon":"icon-shopping","key":"shoppingServices","parentCategoryId":null,"color":"#7a18c2"},"65":{"id":65,"icon":"icon-clothing","key":"clothingShoes","parentCategoryId":64},"66":{"id":66,"icon":"icon-heart","key":"beautyProducts","parentCategoryId":48},"67":{"id":67,"icon":"icon-scissors","key":"barberHairdressingSalon","parentCategoryId":48},"69":{"id":69,"icon":"icon-iron","key":"dryCleaningClothingRepairs","parentCategoryId":64},"70":{"id":70,"icon":"icon-leisure","key":"leisure","parentCategoryId":null,"color":"#7a18c2"},"71":{"id":71,"icon":"icon-fast-food","key":"fastFoodQuickService","parentCategoryId":116},"72":{"id":72,"icon":"icon-spa","key":"spaMassagePersonalCare","parentCategoryId":48},"76":{"id":76,"icon":"icon-membership","key":"
memberships","parentCategoryId":70},"79":{"id":79,"icon":"icon-beer","key":"barsPubsNightclubs","parentCategoryId":70},"81":{"id":81,"icon":"icon-martini","key":"alcoholBars","parentCategoryId":116},"82":{"id":82,"icon":"icon-champagne","key":"tobaccoAlcohol","parentCategoryId":64},"84":{"id":84,"icon":"icon-restaurant","key":"restaurants","parentCategoryId":116},"86":{"id":86,"icon":"icon-education","key":"education","parentCategoryId":null,"color":"#02685d"},"87":{"id":87,"icon":"icon-student-loans","key":"studentLoanPayment","parentCategoryId":86},"88":{"id":88,"icon":"icon-graduation","key":"tuitionCourses","parentCategoryId":86},"89":{"id":89,"icon":"icon-papers","key":"schoolBooksMaterialsStationery","parentCategoryId":86},"91":{"id":91,"icon":"icon-gift","key":"gifts","parentCategoryId":64},"95":{"id":95,"icon":"icon-party","key":"partiesCelebrations","parentCategoryId":64},"96":{"id":96,"icon":"icon-plane","key":"vacationTravel","parentCategoryId":null,"color":"#0064d8"},"97":{"id":97,"icon":"icon-train","key":"transportationCarRental","parentCategoryId":96},"98":{"id":98,"icon":"d","key":"hotelsAccommodation","parentCategoryId":96},"99":{"id":99,"icon":"icon-surfboard","key":"recreationEntertainmentonVacation","parentCategoryId":96},"100":{"id":100,"icon":"icon-alcoholic-drink","key":"foodLivingExpensesonVacation","parentCategoryId":96},"101":{"id":101,"icon":"icon-savings","key":"investmentsSavings","parentCategoryId":null,"color":"#38629c"},"102":{"id":102,"icon":"u","key":"regularSavings","parentCategoryId":101},"104":{"id":104,"icon":"icon-donations-and-charities","key":"charityDonations","parentCategoryId":null,"color":"#bc2e8d"},"105":{"id":105,"icon":"icon-pets","key":"pets","parentCategoryId":40},"106":{"id":106,"icon":"icon-trailer","key":"summerHome/Cottage/Trailer","parentCategoryId":13},"828":{"id":828,"icon":"icon-condo-fees","key":"condoFees","parentCategoryId":13},"830":{"id":830,"icon":"icon-tangerine-rewards","key":"tangerineMoneyBackReward
s","parentCategoryId":1},"831":{"id":831,"icon":"icon-uncategorized-expenses","key":"uncategorizedExpenses","parentCategoryId":null,"color":"#757575"},"832":{"id":832,"icon":"u","key":"goalSavings","parentCategoryId":101},"833":{"id":833,"icon":"u","key":"recipes","parentCategoryId":101},"113":{"id":113,"icon":"icon-computer","key":"electronicsComputers","parentCategoryId":64},"114":{"id":114,"icon":"icon-jewelry","key":"jewelleryAccessories","parentCategoryId":64},"115":{"id":115,"icon":"icon-toll","key":"tolls","parentCategoryId":32},"116":{"id":116,"icon":"p","key":"foodDining","parentCategoryId":null,"color":"#9339d4"},"117":{"id":117,"icon":"icon-carwash","key":"carCleaning","parentCategoryId":32},"118":{"id":118,"icon":"icon-rattle","key":"supportPayment","parentCategoryId":40},"121":{"id":121,"icon":"icon-sin","key":"socialInsurance","parentCategoryId":1},"124":{"id":124,"icon":"icon-loan","key":"loanPayments","parentCategoryId":54},"129":{"id":129,"icon":"icon-pacifier","key":"childcareProducts","parentCategoryId":40},"131":{"id":131,"icon":"icon-emergency-savings","key":"emergencyFund","parentCategoryId":101},"143":{"id":143,"icon":"icon-other-pension","key":"otherPension","parentCategoryId":1},"145":{"id":145,"icon":"icon-rental-income","key":"rentalIncome","parentCategoryId":1},"199":{"id":199,"icon":"icon-compass","key":"outdoorActivities","parentCategoryId":70},"202":{"id":202,"icon":"icon-paintbrush","key":"artsCraftsMusic","parentCategoryId":70},"210":{"id":210,"icon":"icon-exclude-from-budget","key":"excludefromBudget","parentCategoryId":null,"color":"#d50179"},"211":{"id":211,"icon":"icon-credit-card-payment","key":"creditCardPayments","parentCategoryId":210},"212":{"id":212,"icon":"icon-transfers","key":"transfersBetweenownAccounts","parentCategoryId":210},"216":{"id":216,"icon":"icon-cash-withdrawal","key":"aBMOtherCashWithdrawals","parentCategoryId":831},"258":{"id":258,"icon":"icon-dice","key":"lotteriesGambling","parentCategoryId":70},"259":{"i
d":259,"icon":"icon-glasses","key":"eyeCare","parentCategoryId":48},"268":{"id":268,"icon":"icon-uncategorized-income","key":"uncategorizedIncome","parentCategoryId":1},"283":{"id":283,"icon":"icon-income-tax-89","key":"incomeTaxPaid","parentCategoryId":54},"289":{"id":289,"icon":"icon-headphones","key":"musicandApps","parentCategoryId":64},"292":{"id":292,"icon":"icon-savings-transfers","key":"savingsAccountTransactions","parentCategoryId":210},"298":{"id":298,"icon":"icon-pocket-money","key":"allowancePocketMoneyEtc","parentCategoryId":40},"307":{"id":307,"icon":"icon-insurance","key":"insurance","parentCategoryId":null,"color":"#9b53cb"},"315":{"id":315,"icon":"icon-retirement-savings","key":"retirementSavings","parentCategoryId":101},"801":{"id":801,"icon":"icon-support-income","key":"supportIncome","parentCategoryId":1},"802":{"id":802,"icon":"icon-coffee","key":"coffeeShops","parentCategoryId":116},"803":{"id":803,"icon":"icon-desk-1","key":"homeOffice","parentCategoryId":13},"804":{"id":804,"icon":"icon-dependants","key":"dependants","parentCategoryId":40},"805":{"id":805,"icon":"icon-tooth","key":"dental","parentCategoryId":48},"806":{"id":806,"icon":"icon-bottle","key":"nursingCareFacilities","parentCategoryId":48},"807":{"id":807,"icon":"icon-sports-apparel","key":"sportsApparel","parentCategoryId":64},"808":{"id":808,"icon":"icon-pens","key":"schoolSupplies","parentCategoryId":64},"809":{"id":809,"icon":"m","key":"entertainment","parentCategoryId":70},"810":{"id":810,"icon":"icon-school-bus","key":"fieldtripsMiscellaneousExpenses","parentCategoryId":86},"811":{"id":811,"icon":"icon-alcoholic-drink","key":"travelAgenciesTourOperators","parentCategoryId":96},"812":{"id":812,"icon":"icon-signpost","key":"souvenirsGifts","parentCategoryId":96},"813":{"id":813,"icon":"icon-securities","key":"securities","parentCategoryId":101},"814":{"id":814,"icon":"icon-retirement-savings","key":"rRSPs/RSPs","parentCategoryId":101},"815":{"id":815,"icon":"icon-savings","key"
:"tFSAs","parentCategoryId":101},"816":{"id":816,"icon":"icon-trust-fund","key":"trustFund","parentCategoryId":101},"817":{"id":817,"icon":"icon-education","key":"rESPs","parentCategoryId":101},"818":{"id":818,"icon":"icon-travel-insurance","key":"travelInsurancePremiums","parentCategoryId":307},"819":{"id":819,"icon":"icon-marine-insurance","key":"marineInsurancePremiums","parentCategoryId":307},"820":{"id":820,"icon":"icon-pet-insurance","key":"petInsurancePremiums","parentCategoryId":307},"821":{"id":821,"icon":"icon-charities","key":"charities","parentCategoryId":104},"822":{"id":822,"icon":"icon-donations","key":"donations","parentCategoryId":104},"823":{"id":823,"icon":"icon-credit-card-other","key":"nonTangerineCreditCardPayments","parentCategoryId":831},"824":{"id":824,"icon":"icon-weights","key":"gymSports","parentCategoryId":48},"826":{"id":826,"icon":"icon-tangerine-loan-payment","key":"tangerineLoanPayments","parentCategoryId":210},"827":{"id":827,"icon":"icon-tangerine-mortgage-payment","key":"tangerineMortgagePayments","parentCategoryId":210},"829":{"id":829,"icon":"icon-interest-charge","key":"interestCharge","parentCategoryId":54},"112":{"id":112,"icon":"icon-question2","key":"uncategorized","parentCategoryId":null,"color":"#757575"}}
Lookup #2: translation tag to natural language description
Next the translation tag obtained from step #1 (i.e. electronicsComputers) is looked up in a file called en_CA.json (or fr_CA.json if using the site in French), this gives the desired natural language description: Electronics & Computers
{"OMITTED_OTHER_STUFF":{},"global":{"OMITTED_OTHER_STUFF":{},"transactionsCategories":{"income":"Income","payrollorEmploymentIncome":" Payroll or Employment Income","interestCapitalIncome":"Interest & Capital Income","taxRefund":"Tax Refund","home":"Home","groceries":" Groceries","electricityHeatingUtilities":" Electricity & Heating (Utilities)","homeInsurancePremiums":" Home Insurance Premiums","tVPhoneInternet":"TV, Phone & Internet","homeCleaning":"Home Cleaning","rentPaid":"Rent Paid","furnishingsAppliancesDécor":"Furnishings, Appliances & Décor","flowersCandlesArtSmallItems":"Flowers, Candles, Art & Small Items","mortgage":"Mortgage","propertyTaxes":"Property Taxes","homeImprovementsRepairs":"Home Improvements & Repairs","carsTransportation":"Cars & Transportation","autoInsurancePremiums":"Auto Insurance Premiums","parking":" Parking","gasFuel":"Gas & Fuel","autoMaintenanceExpenses":"Auto Maintenance & Expenses ","autoLoanLeasePayment":"Auto Loan & Lease Payment","taxisPublicTransportation":"Taxis & Public Transportation","family":"Family","childcare":" Childcare","children'sClothing":"Children's Clothing","children'sToys":"Children's Toys","children'sRecreationalActivities":"Children's Recreational Activities","healthBeauty":"Health & Beauty","drugStoresandPharmacies":" Drug Stores and Pharmacies","lifeHealthInsurancePremiums":"Life & Health Insurance Premiums","doctorsHealthcare":"Doctors & Healthcare","sports":"Sports","loansTaxesandFines":"Loans, Taxes, and Fines","fines":" Fines","bankFeesServiceFees":"Bank Fees & Service Fees","subscriptionsMedia":"Subscriptions & Media","shoppingServices":"Shopping & Services","clothingShoes":"Clothing & Shoes","beautyProducts":"Beauty Products","barberHairdressingSalon":"Barber & Hairdressing Salon","dryCleaningClothingRepairs":"Dry Cleaning & Clothing Repairs","leisure":"Leisure ","fastFoodQuickService":"Fast Food (Quick Service)","spaMassagePersonalCare":"Spa, Massage & Personal Care","memberships":" 
Memberships","barsPubsNightclubs":"Bars, Pubs & Nightclubs ","alcoholBars":"Alcohol & Bars ","tobaccoAlcohol":"Tobacco & Alcohol","restaurants":"Restaurants ","education":"Education","studentLoanPayment":" Student Loan Payment","tuitionCourses":"Tuition & Courses","schoolBooksMaterialsStationery":"School Books, Materials & Stationery","gifts":"Gifts","partiesCelebrations":"Parties & Celebrations","vacationTravel":"Vacation & Travel","transportationCarRental":" Transportation & Car Rental","hotelsAccommodation":"Hotels & Accommodation","recreationEntertainmentonVacation":"Recreation & Entertainment on Vacation","foodLivingExpensesonVacation":"Food & Living Expenses on Vacation","investmentsSavings":"Investments & Savings","regularSavings":" Regular Savings","goalSavings":"Goals","charityDonations":"Charity & Donations","pets":"Pets","summerHome/Cottage/Trailer":"Summer Home/ Cottage/Trailer ","uncategorizedExpenses":"Miscellaneous Expenses","electronicsComputers":"Electronics & Computers","jewelleryAccessories":"Jewellery & Accessories","tolls":"Tolls","foodDining":"Food & Dining","carCleaning":"Car Cleaning","supportPayment":"Support Payment","socialInsurance":"Payments from Government","loanPayments":"Loan Payments","childcareProducts":"Childcare Products","emergencyFund":"Emergency Fund","otherPension":"Other Pension","rentalIncome":"Rental Income","outdoorActivities":"Outdoor Activities ","artsCraftsMusic":"Arts, Crafts & Music","excludefromBudget":"Exclude from Budget","creditCardPayments":" Credit Card Payments","transfersBetweenownAccounts":"Transfers Between own Accounts","aBMOtherCashWithdrawals":" ABM & Other Cash Withdrawals","uncategorizedTransfers":"Uncategorized Transfers","lotteriesGambling":"Lotteries & Gambling","eyeCare":"Eye Care","uncategorizedIncome":"Miscellaneous Income","incomeTaxPaid":"Income Tax Paid","musicandApps":"Music and Apps","savingsAccountTransactions":"Savings Account Transactions","allowancePocketMoneyEtc":"Allowance, Pocket 
Money, Etc.","insurance":"Insurance","retirementSavings":"Retirement Savings","supportIncome":"Support Income","coffeeShops":"Coffee Shops","homeOffice":"Home Office","dependants":"Dependants","dental":"Dental","nursingCareFacilities":"Nursing & Care Facilities","sportsApparel":"Sports Apparel","schoolSupplies":"School Supplies","entertainment":"Entertainment","fieldtripsMiscellaneousExpenses":"Fieldtrips & Miscellaneous Expenses","travelAgenciesTourOperators":"Travel Agencies & Tour Operators","souvenirsGifts":"Souvenirs & Gifts ","securities":"Securities","rRSPs/RSPs":"RRSPs/RSPs","tFSAs":"TFSAs","trustFund":"Trust Fund","rESPs":"RESPs","travelInsurancePremiums":"Travel Insurance Premiums","marineInsurancePremiums":"Marine Insurance Premiums","petInsurancePremiums":"Pet Insurance Premiums","charities":" Charities","donations":"Donations","gymSports":"Gym & Sports","uncategorized":"Uncategorized","other":"Other","interestCharge":"Interest Charge","tangerineLoanPayments":"Tangerine Loan Payments","tangerineMortgagePayments":"Tangerine Mortgage Payments","tangerineMoneyBackRewards":"Tangerine Money-Back Rewards","condoFees":"Condo Fees","nonTangerineCreditCardPayments":"Non-Tangerine Credit Card Payments","recipes":"Money Rules"}}}
I believe the logic would be something like this:
jq --argfile cat_lookup categories.json --argfile cat_word en_CA.json \
'if $cat_lookup.transactionsCategories.id == .category_id then .cat_desc = $cat_word[$cat_lookup.key] end' \
${START_YEAR}-transaction.json | head
jq: error: syntax error, unexpected end (Unix shell quoting issues?) at <top-level>, line 1:
if $cat_lookup.transactionsCategories.id == .category_id then .cat_desc = $cat_word[$cat_lookup.key] end
jq: error: Possibly unterminated 'if' statement at <top-level>, line 1:
if $cat_lookup.transactionsCategories.id == .category_id then .cat_desc = $cat_word[$cat_lookup.key] end
jq: 2 compile errors
Obviously this fails because my syntax is bad, and there needs to be a search through all of the $cat_lookup.transactionsCategories and $cat_word values to match everything up...
what is the best way to accomplish that with jq?

Actually it's fairly simple references:
jq --argfile cat_lookup categories.json --argfile cat_word en_CA.json '
.category_key = $cat_lookup[.category_id | tostring].key
| .category_desc = $cat_word.global.transactionsCategories[.category_key]
' ${START_YEAR}-transaction.json
producing
{
"transaction_date": "2020-01-01",
"id": 365707178,
"category_id": 113,
"description": "PAYPAL *JOES DC",
"category_key": "electronicsComputers",
"category_desc": "Electronics & Computers"
}
{
"transaction_date": "2020-01-02",
"id": 366592530,
"category_id": 84,
"description": "SANSOTEI RAMEN",
"category_key": "restaurants",
"category_desc": "Restaurants "
}
...
Or in one step (without storing the category key):
jq --argfile cat_lookup categories.json --argfile cat_word en_CA.json '
.category_desc = $cat_word.global.transactionsCategories[
$cat_lookup[.category_id | tostring].key
]
' ${START_YEAR}-transaction.json
Note: The jq manual discourages you from using --argfile and suggests to use --slurpfile instead. This would slightly change the way you have to access your variables, as they would become an array containing the actual object as single element. Thus, with --slurpfile use $cat_lookup[0] and $cat_word[0] instead of $cat_lookup and $cat_word, respectively.

Related

Load CSV file in single variant column in snowflake table

I have 100 unstructured CSV files and need to load the data into a single VARIANT column. The code posted below creates two rows if two rows are present in the file, but the requirement is to create a single row that stores the data from both rows. What changes can I make to the code?
Table will contain data in DATA column
CREATE OR REPLACE TABLE rtf_lines
(
LOADED_AT timestamp,
FILENAME string,
FILE_ROW_NUMBER int,
DATA VARIANT
);
copy data into table and JSON object to support up to 20 CSV columns, it can be extended
-- Load each record of the staged files into rtf_lines, packing the first
-- 20 positional columns ($1..$20) of every record into one OBJECT stored
-- in the DATA column. One output row is produced per input record.
COPY INTO rtf_lines
from
(
SELECT
CURRENT_TIMESTAMP as LOADED_AT,
METADATA$FILENAME as FILENAME,
METADATA$FILE_ROW_NUMBER as FILE_ROW_NUMBER,
object_construct(
'col_001', T.$1, 'col_002', T.$2, 'col_003', T.$3, 'col_004', T.$4,
'col_005', T.$5, 'col_006', T.$6, 'col_007', T.$7, 'col_008', T.$8,
'col_009', T.$9, 'col_010', T.$10, 'col_011', T.$11, 'col_012', T.$12,
'col_013', T.$13, 'col_014', T.$14, 'col_015', T.$15, 'col_016', T.$16,
'col_017', T.$17, 'col_018', T.$18, 'col_019', T.$19, 'col_020', T.$20
) as data
-- "@%<table>" addresses the table stage that belongs to rtf_lines.
FROM @%rtf_lines T
)
-- NOTE(review): the options below are CSV file-format options and the input
-- files are described as CSV, yet TYPE is JSON -- confirm the intended TYPE.
FILE_FORMAT =
(
TYPE = JSON
RECORD_DELIMITER = '\n'
ESCAPE_UNENCLOSED_FIELD = NONE
FIELD_OPTIONALLY_ENCLOSED_BY='0x22'
EMPTY_FIELD_AS_NULL=FALSE
);
Code will output as:-
Row 1
LOADED_AT 2022-06-02 06:09:57.363
FILENAME #RTF_LINES/ui1654167360506/rtf_snowflake_sample.csv
FILE_ROW_NUMBER 1
DATA { "col_001": "NDTV.com provides latest news", "col_002": " videos from India and the world. Get today’s news headlines from Business", "col_003": " " }
Row 2
LOADED_AT 2022-06-02 06:09:57.363
FILENAME #RTF_LINES/ui1654167360506/rtf_snowflake_sample.csv
FILE_ROW_NUMBER 2
DATA { "col_001": "Technology", "col_002": " Sports", "col_003": " Movies", "col_004": " videos", "col_005": " photos", "col_006": " live news coverage and exclusive breaking news from India.}
Output expected as :-
Row 1
LOADED_AT 2022-06-02 06:09:57.363
FILENAME #RTF_LINES/ui1654167360506/rtf_snowflake_sample.csv
FILE_ROW_NUMBER 1
DATA { "col_001": "NDTV.com provides latest news", "col_002": " videos from India and the world. Get today’s news headlines from Business", "col_003": " ",
"col_004": "Technology", "col_005": " Sports", "col_006": " Movies", "col_007": " videos", "col_008": " photos", "col_009": " live news coverage and exclusive breaking news from India.}

Latex table from CSV file

The following code for creating Latex table from csv file is missing horizontal line on top of the table.
\documentclass{article}
\usepackage{csvsimple}
% Make csv in question
\begin{filecontents*}{check.csv}
labels,names,A,C,V,tools
a,example,838,663,683,
b,otter,353,215,192,
d,\textbf{broccoli},79,79,117,
e,fibredensityandcrosssection,1086,849,868,
ad,hcp-prefree:exec-centos7.freebuild-centos4-latest,70,76,157,
ar,shots47s\_fmriprep-1.2.3,,,,453
\end{filecontents*}
\begin{table*}[!ht]
\csvreader[%
tabular={|c|c|c|c|c|c|},
table head = \textbf{Labels} &\textbf{{names}} & \textbf{A} & \textbf{C} & \textbf{V} & \textbf{T}\\\hline,
late after line= \\,
late after last line=\\\hline %
]{check.csv}{labels=\labels,names=\names,A=\A,C=\C,V=\V,tools=\tools}%
{\labels & \names & \A & \C & \V & \tools}
\centering
\caption{\label{table1}Number by category}
\end{table*}
\end{document}
How can we add a horizontal line at the top of the table?
Please note that building such a data prison is bad style. Have a look at http://betterposters.blogspot.de/2012/08/the-data-prison.html , https://www.inf.ethz.ch/personal/markusp/teaching/guides/guide-tables.pdf or https://wiert.me/2014/04/03/andre-vatter-google-wie-tabellen-eigentlich-aussehen-sollten-%EF%BB%BF/ for some guides about nice looking tables.
\documentclass{article}
\usepackage{csvsimple}
% Make csv in question
\begin{filecontents*}{check.csv}
labels,names,A,C,V,tools
a,example,838,663,683,
b,otter,353,215,192,
d,\textbf{broccoli},79,79,117,
e,fibredensityandcrosssection,1086,849,868,
ad,hcp-prefree:exec-centos7.freebuild-centos4-latest,70,76,157,
ar,shots47s\_fmriprep-1.2.3,,,,453
\end{filecontents*}
\begin{document}
\begin{table*}[!ht]
\csvreader[%
tabular={|c|c|c|c|c|c|},
table head = \hline\textbf{Labels} &\textbf{{names}} & \textbf{A} & \textbf{C} & \textbf{V} & \textbf{T}\\\hline,
late after line= \\,
late after last line=\\\hline %
]{check.csv}{labels=\labels,names=\names,A=\A,C=\C,V=\V,tools=\tools}%
{\labels & \names & \A & \C & \V & \tools}
\centering
\caption{\label{table1}Number by category}
\end{table*}
\end{document}

jq - How to extract domains and remove duplicates

Given the following json:
Full file here: https://pastebin.com/Hzt9bq2a
{
"name": "Visma Public",
"domains": [
"accountsettings.connect.identity.stagaws.visma.com",
"admin.stage.vismaonline.com",
"api.home.stag.visma.com",
"api.workbox.dk",
"app.workbox.dk",
"app.workbox.co.uk",
"authz.workbox.dk",
"connect.identity.stagaws.visma.com",
"eaccounting.stage.vismaonline.com",
"eaccountingprinting.stage.vismaonline.com",
"http://myservices-api.stage.vismaonline.com/",
"identity.stage.vismaonline.com",
"myservices.stage.vismaonline.com"
]
}
How can I transform the data to the below. Which is, to identify the domains in the format of site.SLD.TLD present and then remove the duplication of them. (Not including the subdomains, protocols or paths as illustrated below.)
{
"name": "Visma Public",
"domains": [
"workbox.co.uk",
"workbox.dk",
"visma.com",
"vismaonline.com"
]
}
I would like to do so in jq, as that is what I've used to wrangle the data into this format so far, but at this stage any solution that I can run on Debian (I'm using bash), ideally without any extraneous tooling, would be fine.
I'm aware that regex can be used within jq, so I assume the best way is to regex out the domain and then pipe to unique; however, I'm unable to get anything working so far. I'm currently trying this version, which seems to me to need only the text-transformation stage added in somehow, either during the jq process or with a pass afterwards using something like awk:
jq '[.[] | {name: .name, domain: [.domains[]] | unique}]' testfile.json
This appears to be useful: https://github.com/stedolan/jq/issues/537
One solution was offered which does a regex match to extract the last two strings separated by . and calls the unique function on that. It works up to a point, but doesn't cover site.SLD.TLD where the suffix has 2 parts; google.co.uk, for example, would return only co.uk with this jq:
jq '.domains |= (map(capture("(?<x>[[:alpha:]]+).(?<z>[[:alpha:]]+)(.?)$") | join(".")) | unique)'
A programming language is much more expressive than jq.
Try the following snippet with python3.
import json
import pprint
import urllib.request
from urllib.parse import urlparse
import os
def get_tlds():
    """Download the Public Suffix List and return its rule lines.

    Returns:
        list[str]: every line of the downloaded file that is neither empty
        nor a ``//`` comment, in file order.

    Raises:
        urllib.error.URLError: if the list cannot be fetched.
    """
    # Use the response as a context manager so the connection is closed
    # as soon as the body has been read instead of being leaked.
    with urllib.request.urlopen("https://publicsuffix.org/list/effective_tld_names.dat") as f:
        content = f.read()
    lines = content.decode('utf-8').split("\n")
    # remove comments
    tlds = [line for line in lines if not line.startswith("//") and not line == ""]
    return tlds
def extract_domain(url, tlds):
    """Reduce a URL or host name to its registrable domain.

    Args:
        url: a host name, optionally prefixed with ``http://``/``https://``
            and optionally followed by a path.
        tlds: container of known public suffixes (e.g. ``"com"``,
            ``"co.uk"``); only membership tests are performed on it.

    Returns:
        str: ``label.suffix`` where ``suffix`` is the last two labels when
        that pair is listed in ``tlds`` and a third label exists, otherwise
        the last label. A host without any dot is returned unchanged.
    """
    # get domain
    url = url.replace("http://", "").replace("https://", "")
    url = url.split("/")[0]
    # get tld/sld
    parts = url.split(".")
    if len(parts) < 2:
        # Single-label host ("localhost", ""): nothing to split off, and
        # indexing parts[-2] would raise IndexError.
        return url
    # try the longer suffix first
    if len(parts) > 2 and ".".join(parts[-2:]) in tlds:
        return ".".join(parts[-3:])
    return ".".join(parts[-2:])
def clean(site, tlds):
    """Replace ``site["domains"]`` with its de-duplicated registrable domains.

    Args:
        site: dict with a ``"domains"`` list of URLs/host names; it is
            modified in place.
        tlds: known public suffixes, passed through to ``extract_domain``.

    Returns:
        dict: the same ``site`` object, for convenient use in comprehensions.
    """
    unique_domains = {extract_domain(url, tlds) for url in site["domains"]}
    # Sort so the output is stable; the iteration order of a set of strings
    # can differ from one interpreter run to the next.
    site["domains"] = sorted(unique_domains)
    return site
if __name__ == "__main__":
    # Input dump to clean, and the local cache of public-suffix rules.
    filename = "Hzt9bq2a.json"
    cache_path = "tlds.json"

    # Download the suffix list only when no cached copy exists yet.
    if not os.path.exists(cache_path):
        tlds = get_tlds()
        with open(cache_path, "w") as f:
            json.dump(tlds, f)
    else:
        with open(cache_path, "r") as f:
            tlds = json.load(f)

    # The input file holds a list of site records.
    with open(filename) as f:
        d = json.load(f)

    d = [clean(site, tlds) for site in d]
    pprint.pprint(d)

    # Persist the cleaned records next to the input.
    with open("clean.json", "w") as f:
        json.dump(d, f)
May I offer you achieving the same query with jtc: the same could be achieved in other languages (and of course in jq) - the query is mostly how to come up with the regex to satisfy your ask:
bash $ <file.json jtc -w'<domains>l:>((?:[a-z0-9]+\.)?[a-z0-9]+\.[a-z0-9]+)[^.]*$<R:' -u'{{$1}}' /\
-ppw'<domains>l:><q:' -w'[domains]:<[]>j:' -w'<name>l:'
{
"domains": [
"stagaws.visma.com",
"stage.vismaonline.com",
"stag.visma.com",
"api.workbox.dk",
"app.workbox.dk",
"workbox.co.uk",
"authz.workbox.dk"
],
"name": "Visma Public"
}
bash $
Note: it does extract only DOMAIN.TLD, as per your ask. If you like to extract DOMAIN.SLD.TLD, then the task becomes a bit less trivial.
Update:
Modified solution as per the comment: extract domain.sld.tld where 3 or more levels and domain.tld where there’s only 2
PS. I'm the creator of the jtc - JSON processing utility. This disclaimer is SO requirement.
One of the solutions presented on this page offers that:
A programming language is much more expressive than jq.
It may therefore be worthwhile pointing out that jq is an expressive, Turing-complete programming language, and that it would be as straightforward (and as tedious) to capture all the intricacies of the "Public Suffix List" using jq as any other programming language that does not already provide support for this list.
It may be useful to illustrate an approach to the problem that passes the (revised) test presented in the Q. This approach could easily be extended in any one of a number of ways:
# extract: reduce a URL or host name to its trailing labels.
# Strips a leading "scheme://" and everything from the first "/", then keeps
# the last three dot-separated labels when the final label has length 2 and
# the one before it has length <= 3 (e.g. "co.uk"), otherwise the last two.
def extract:
  sub("^[^:]*://";"")
  | sub("/.*$";"")
  | split(".")
  | if (.[-1] | length) == 2 and (.[-2] | length) <= 3
    then .[-3:]
    else .[-2:]
    end
  | join(".");

# Keep the name and emit the sorted, de-duplicated list of extracted domains.
{name, domain: (.domains | map(extract) | unique)}
Output
{
"name": "Visma Public",
"domain": [
"visma.com",
"vismaonline.com",
"workbox.co.uk",
"workbox.dk"
]
}
Judging from your example, you don't actually want top-level domains (just one component, e.g. ".com"), and you probably don't really want second-level domains (last two components) either, because some domain registries don't operate at the TLD level. Given www.foo.com.br, you presumably want to find out about foo.com.br, not com.br.
To do that, you need to consult the Public Suffix List. The file format isn't too complicated, but it has support for wildcards and exceptions. I dare say that jq isn't the ideal language to use here — pick one that has a URL-parsing module (for extracting hostnames) and an existing Public Suffix List module (for extracting the domain parts from those hostnames).

Replace single quotes in double quotes in brackets

I must modify a JSON file. I must replace the single quotes with double quotes, but I can't use the command sed -i -r "s/'/\"/g" file because the file contains other single quotes that I must not change.
The following code is an example of string:
"categories": [['Clothing, Shoes & Jewelry', 'Girls'], ['Clothing, Shoes & Jewelry', 'Novelty, Costumes & More', 'Costumes & Accessories', 'More Accessories', 'Kids & Baby']]
The desired result should be:
"categories": [["Clothing, Shoes & Jewelry", "Girls"], ["Clothing, Shoes & Jewelry", "Novelty, Costumes & More", "Costumes & Accessories", "More Accessories", "Kids & Baby"]]
sample file:
{"categories": [['Movies & TV', 'Movies']], "title": "Understanding Seizures and Epilepsy DVD"},
{"title": "Who on Earth is Tom Baker?", "salesRank": {"Books": 3843450}, "categories": [['Books']]},
{"categories": [['Clothing, Shoes & Jewelry', 'Girls'], ['Clothing, Shoes & Jewelry', 'Novelty, Costumes & More', 'Costumes & Accessories', 'More Accessories', 'Kids & Baby']], "description": "description, "title": "Mog's Kittens", "salesRank": {"Books": 1760368}}},
{"description": "Three Dr. Suess' Puzzles", "brand": "Dr. Seuss", "categories": [['Toys & Games', 'Puzzles', 'Jigsaw Puzzles']]},
I used a regular expression, but the problem is that I don't know how many elements are in the brackets. So I would like a way to replace all single quotes inside the brackets; that would be the perfect solution, but I cannot find it.
#!/usr/bin/perl -w
use strict;

# Filter: copy input lines to stdout, turning single quotes into double
# quotes only inside innermost [...] groups; all other text is untouched.

# read each line from stdin
while (my $l=<>) {
    chomp($l); # remove newline char
    # split: get contents of innermost square brackets
    # (the capturing group makes split keep the bracketed pieces in the list)
    my @a=split(/(\[[^][]*\])/,$l);
    # foreach aliases $i to each element, so the substitution edits @a in place
    foreach my $i (@a) {
        # replace quotes iff innermost square brackets
        if ($i=~/^\[/) { $i=~s/'/"/g; }
    }
    # join and print
    print join('',@a)."\n";
}
I found a way to do that, using python.
Note that the json stream you provided is not recognized by python json because of single quotes (and also some copy/paste problems, missing quotes, I fixed that).
My solution relies entirely on the Python standard library; I doubt you can do the same with sed, which is why I'm providing it even though you didn't mention that technology.
I read the data using ast.literal_eval since it's a list of dictionaries written in exact Python syntax. Single quotes are not a problem for ast.
I write the data using json.dump. It writes the data using double quotes.
Note that I write it in a "fake" file (i.e. a string with I/O write method to "fool" the json serializer).
Here's a standalone snippet that works:
import ast
import io
import json

# Sample input: a list of dicts written with Python literal syntax, mixing
# single- and double-quoted strings (so it is not valid JSON as-is).
foo = """[{"categories": [['Movies & TV', 'Movies']], "title": "Understanding Seizures and Epilepsy DVD"},
{"title": "Who on Earth is Tom Baker?", "salesRank": {"Books": 3843450}, "categories": [['Books']]},
{"categories": [['Clothing, Shoes & Jewelry', 'Girls'], ['Clothing, Shoes & Jewelry', 'Novelty, Costumes & More', 'Costumes & Accessories', 'More Accessories', 'Kids & Baby']], "description": "description", "title": "Mog's Kittens", "salesRank": {"Books": 1760368}},
{"description": "Three Dr. Suess' Puzzles",
"brand": "Dr. Seuss", "categories": [['Toys & Games', 'Puzzles', 'Jigsaw Puzzles']]}
]"""

# In-memory text buffer standing in for a real output file.
fp = io.StringIO()

# literal_eval parses Python literals only (no arbitrary code execution),
# accepting both quote styles.
json_data = ast.literal_eval(foo)

# json.dump always emits double-quoted strings; apostrophes inside values
# (e.g. "Mog's Kittens") are preserved as data.
json.dump(json_data, fp)
print(fp.getvalue())
result:
[{"categories": [["Movies & TV", "Movies"]], "title": "Understanding Seizures and Epilepsy DVD"}, {"salesRank": {"Books": 3843450}, "categories": [["Books"]], "title": "Who on Earth is Tom Baker?"}, {"description": "description", "salesRank": {"Books": 1760368}, "categories": [["Clothing, Shoes & Jewelry", "Girls"], ["Clothing, Shoes & Jewelry", "Novelty, Costumes & More", "Costumes & Accessories", "More Accessories", "Kids & Baby"]], "title": "Mog's Kittens"}, {"brand": "Dr. Seuss", "description": "Three Dr. Suess' Puzzles", "categories": [["Toys & Games", "Puzzles", "Jigsaw Puzzles"]]}]
Here's a full script taking 2 parameters (input file & output file) and performing the conversion. You can use this script within your already existing bash scripts if you're not comfortable with python (save that in fix_quotes.py for instance):
import ast,json,sys

# Usage: python fix_quotes.py INPUT_FILE OUTPUT_FILE
input_file = sys.argv[1]
output_file = sys.argv[2]

# Parse the whole input as a single Python literal; literal_eval accepts
# single-quoted strings, which a strict JSON parser would reject.
with open(input_file,"r") as fr:
    json_data=ast.literal_eval(fr.read())

# Re-serialize as JSON, which always uses double quotes for strings.
with open(output_file,"w") as fw:
    json.dump(json_data,fw)

R: Extracting elements between characters in a web page

I have two lines of info from a web page that I want to parse into a data.frame.
[104] " $1775 / 2br - 1112ft² - Wonderful two bedroom two bathroom with balcony! (14001 NE 183rd Street )"
[269] " var pID = \"4619136687\";"
I'd like it to look like this.
postID |rent|type|size|description |location
4619136687|1775|2br |1112|Wonderful two bedroom...|14001 NE 183rd Street
I was able to use the sub() command to get the ID but I'm not exactly familiar with regex in the sub() command to parse out what I need when there are spaces, such as in line [104].
# Replace the entire input string with capture group 1, i.e. the text found
# between `pID = "` and `";` (here the post ID digits).
sub(".*pID = \"(.*)\";.*","\\1", " var pID = \"4619136687\";")
Any help would be wonderful, Thanks!