How to open multiple json files in Python? - json

I have to open a lot of JSON files in Python. The following code works fine for a small number of JSON files, but I have already been waiting for 6 hours now and it is still not finished. I am sure there must be a faster way than this.
base_dir = 'All Datasets EDIT/airlinesjson'
json_data_firstmonth2 = pd.DataFrame()
json_data_fmnoreset = pd.DataFrame()
for file in os.listdir(base_dir):
    if 'json' in file:
        json_path = os.path.join(base_dir, file)
        json_data = pd.read_json(json_path, lines=True)
        json_data_fmnoreset = pd.concat([json_data_fmnoreset, json_data], sort=False)
json_data_firstmonth2 = json_data_fmnoreset.reset_index()

Try this piece of code instead:
json_list = [f for f in os.listdir(base_dir) if f.endswith('.json')]
for i in json_list:
    with open(os.path.join(base_dir, i)) as json_file:
        data = json.load(json_file)
        ...
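The main slowdown in the original code is calling pd.concat inside the loop, which copies the accumulated DataFrame on every iteration. A minimal sketch, assuming the same base_dir and line-delimited JSON files as above, that reads each file once, collects the frames in a list, and concatenates a single time:
import os
import pandas as pd

base_dir = 'All Datasets EDIT/airlinesjson'

# Read each JSON file into its own DataFrame first.
frames = [
    pd.read_json(os.path.join(base_dir, f), lines=True)
    for f in os.listdir(base_dir)
    if f.endswith('.json')
]

# Concatenate once at the end instead of once per file.
json_data_firstmonth2 = pd.concat(frames, sort=False, ignore_index=True)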

Related

Code Workbooks - File not found using hadoop_path

I have a Python transform in Code Workbooks that is running this code:
import pandas as pd

def contents(dataset_with_files):
    fs = dataset_with_files.filesystem()
    filenames = [f.path for f in fs.ls()]
    fp = fs.hadoop_path + "/" + filenames[0]
    with open(fp, 'r') as f:
        t = f.read()
    rows = {"text": [t]}
    return pd.DataFrame(rows)
But I am getting the error FileNotFoundError: [Errno 2] No such file or directory.
My understanding is that this is the correct way to access a file in HDFS. Is this a Code Repositories versus Code Workbooks limitation?
This documentation helped me figure it out:
https://www.palantir.com/docs/foundry/code-workbook/transforms-unstructured/
It was actually a pretty small change: if you are using the filesystem() API, you only need the relative path.
import pandas as pd

def contents_old(pycel_test):
    fs = pycel_test.filesystem()
    filenames = [f.path for f in fs.ls()]
    with fs.open(filenames[0], 'r') as f:
        value = ...
    rows = {"values": [value]}
    return pd.DataFrame(rows)
There is also this option, but I found it 10x slower.
from pyspark.sql import Row

def contents(dataset_with_files):
    fs = dataset_with_files.filesystem()  # This is the FileSystem object.
    MyRow = Row("column")

    def process_file(file_status):
        with fs.open(file_status.path, 'r') as f:
            ...

    rdd = fs.files().rdd
    rdd = rdd.flatMap(process_file)
    df = rdd.toDF()
    return df
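Building on the same fs.ls() and fs.open() calls shown above, here is a hedged sketch (not checked against the Foundry documentation, and assuming the raw files are plain text) that reads every file in the dataset into one DataFrame instead of just the first one:
import pandas as pd

def contents_all(pycel_test):
    fs = pycel_test.filesystem()
    paths, texts = [], []
    # fs.ls() and fs.open() are the same calls used in the snippets above.
    for file_status in fs.ls():
        paths.append(file_status.path)
        with fs.open(file_status.path, 'r') as f:
            texts.append(f.read())
    return pd.DataFrame({"path": paths, "text": texts})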

csv file isn't saved in different directory in python

My code reads a bunch of JSON files from a directory, extracts the "frequency" and "attenuation" data from those files, and writes them to a CSV file. Now I want to save that CSV file in a different directory. The code executes without any error but saves the file in the current directory. Can anyone help resolve this issue?
import csv
import glob
import json
import os

site = 'alpha'
frequency_to_check = '196050.000'
json_dir_name = 'V:/temp/test/'
json_pattern = os.path.join(json_dir_name, '*.json')
total_files = glob.glob(json_pattern)
atten = []
timestamp = []
save_path = 'V:/python/result/'
if not os.path.isdir(save_path):
    os.makedirs(save_path)
filename = f'{site}-{frequency_to_check}.csv'
with open(filename, 'w', newline='') as csv_file:
    for file in total_files:
        with open(file) as json_file:
            output_json = json.load(json_file)
            for key in output_json:
                if key['start-freq'] == frequency_to_check:
                    csv.writer(csv_file).writerow([key['start-freq'], key['attenuation']])
save_file = os.path.join(save_path, filename)
csv_file.close()
print(f'Total files processed {len(total_files)}')
The issue, as far as I can deduce, is here:
csv.writer(csv_file).writerow([key['start-freq'], key['attenuation']])
csv_file is the file object you opened, and every time this line is executed you are writing rows into that already open file, which lives in the current directory. Afterwards you are merely building a new path:
save_file = os.path.join(save_path, filename)
which is never actually used, since you close the file right after.
To fix this, build the full save path first and open the CSV file at that path:
import csv
import glob
import json
import os

site = 'alpha'
frequency_to_check = '196050.000'
json_dir_name = 'V:/temp/test/'
json_pattern = os.path.join(json_dir_name, '*.json')
total_files = glob.glob(json_pattern)
atten = []
timestamp = []
save_path = 'V:/python/result/'
if not os.path.isdir(save_path):
    os.makedirs(save_path)
filename = f'{site}-{frequency_to_check}.csv'
save_file = os.path.join(save_path, filename)
with open(save_file, 'w', newline='') as csv_file:
    for file in total_files:
        with open(file) as json_file:
            output_json = json.load(json_file)
            for key in output_json:
                if key['start-freq'] == frequency_to_check:
                    csv.writer(csv_file).writerow([key['start-freq'], key['attenuation']])
print(f'Total files processed {len(total_files)}')
I guess this should work.
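As an optional refinement that the answer above does not mention: csv.writer(csv_file) is re-created for every matching row, which works but is wasteful. A hedged sketch of the same write loop with a single writer object, assuming total_files, save_file and frequency_to_check are defined as above:
import csv
import json

with open(save_file, 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)  # create the writer once, reuse it for every row
    for file in total_files:
        with open(file) as json_file:
            output_json = json.load(json_file)
        for entry in output_json:  # each entry is a dict in the JSON list
            if entry['start-freq'] == frequency_to_check:
                writer.writerow([entry['start-freq'], entry['attenuation']])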

Scrapinghub plugs my results in the log and not in item

I have a functioning spider project to extract URL content (no CSS). I crawled several sets of data and stored them in a series of .csv files. Now I am trying to set it up to work on Scrapinghub in order to do a long scraping run.
So far, I am able to get the spider uploaded and working on Scrapinghub. My problem is that the results appear in the 'log' and not under 'items'. The amount of data exceeds the log capacity and thus gives me an error.
How can I set up my pipelines/extractor to work and return a JSON or CSV file? I would also be happy with a solution that sends the scraped data to a database, as I failed to achieve that too.
Any guidance is appreciated.
The spider:
class DataSpider(scrapy.Spider):
    name = "Data_2018"

    def url_values(self):
        time = list(range(1538140980, 1538140820, -60))
        return time

    def start_requests(self):
        allowed_domains = ["https://website.net"]
        list_urls = []
        for n in self.url_values():
            list_urls.append("https://website.net/.../.../.../all/{}".format(n))
        for url in list_urls:
            yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        data = response.body
        items = positionsItem()
        items['file'] = data
        yield items
The pipeline:
class positionsPipeline(object):
    def process_item(self, item, spider):
        return item
The settings:
BOT_NAME = 'Positions'
SPIDER_MODULES = ['Positions.spiders']
NEWSPIDER_MODULE = 'Positions.spiders'
USER_AGENT = get_random_agent()
ROBOTSTXT_OBEY = True
CONCURRENT_REQUESTS = 32
DOWNLOAD_DELAY = 10
SPIDER_MIDDLEWARES = {
    'Positions.middlewares.positionsSpiderMiddleware': 543,
}
DOWNLOADER_MIDDLEWARES = {
    'Positions.middlewares.positionsDownloaderMiddleware': 543,
}
ITEM_PIPELINES = {
    'Positions.pipelines.positionsPipeline': 300,
}
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
The item:
class positionsItem(scrapy.Item):
    file = scrapy.Field()
Scrapinghub log shows:
13: 2019-02-28 07:46:13 ERROR Rejected message because it was too big: ITM {"_type":"AircraftpositionsItem","file":"{\"success\":true,\"payload\":{\"aircraft\":{\"0\":{\"000001\":[null,null,\"CFFAW\",9.95729,-84.1405,9500,90,136,1538140969,null,null,\"2000\",\"2-39710687\",[9.93233,-84.1386,277]],\"000023\":[\"ULAC\",null,\"PH4P4\",
From your settings file it looks like there isn't a predefined feed output mechanism for Scrapy to use. It's odd that it worked the first time locally (producing a .csv file).
In any case, here are the extra lines you need to add to settings.py for the feed export to work. If you just want to write the output locally to a .csv file:
# Local .csv version
FEED_URI = 'file://NAME_OF_FILE_PATH.csv'
FEED_FORMAT = 'csv'
I also use this version for uploading a JSON file to an S3 bucket:
# Remote S3 .json version
AWS_ACCESS_KEY_ID = YOUR_AWS_ACCESS_KEY_ID
AWS_SECRET_ACCESS_KEY = YOUR_AWS_SECRET_ACCESS_KEY
FEED_URI = 's3://BUCKET_NAME/NAME_OF_FILE_PATH.json'
FEED_FORMAT = 'json'
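As a side note that goes beyond the original answer: in newer Scrapy releases (2.1+) the FEED_URI / FEED_FORMAT pair has been superseded by a single FEEDS setting, so the local CSV example above would look roughly like this sketch:
# settings.py -- newer FEEDS style (Scrapy 2.1+); the file path placeholder is kept from above
FEEDS = {
    'file://NAME_OF_FILE_PATH.csv': {
        'format': 'csv',
    },
}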

Scraping Data from JSON

How can I scrape this data,
http://jsonviewer.stack.hu/#http://91.134.133.185:5000/viaroute?loc=25.299919,55.376774&loc=25.298738,55.369181
and extract only "total_time" to a file?
It should be fairly easy to achieve this with a little searching.
You just have to find some modules to work with JSON, dataframes and text files, and learn how to use them.
Steps (a minimal sketch of these steps follows the documentation links below):
1 - read the JSON data using pandas.read_json()
2 - select the column: data = df['total_time']
3 - write data using pandas.DataFrame.to_csv()
Simple as py.
Documentation:
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_json.html
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html
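Here is a minimal sketch of those steps as I read them; it assumes the JSON response has already been saved locally as response.json, and since total_time is nested under route_summary, json_normalize is used to flatten it:
import json
import pandas as pd

# Load the saved response and flatten the nested route_summary object.
with open("response.json") as f:
    raw = json.load(f)

df = pd.json_normalize(raw)
data = df['route_summary.total_time']

# Write just that column to a file.
data.to_csv("total_time.csv", index=False)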
import json

json_string = '''Json data here'''
data = json.loads(json_string)
total_time = data["route_summary"]["total_time"]

with open("file_name_here.txt", "w+") as f:
    f.write(str(total_time))
I've written this program for you:
import json, urllib2
url = 'http://91.134.133.185:5000/viaroute?loc=25.299919,55.376774&loc=25.298738,55.369181'
response = urllib2.urlopen(url)
data = json.load(response)
tot_time = str(data['route_summary']['total_time'])
s = tot_time + "\n"
outfile = "C:\\Users\\USER\\Desktop\\outfile.txt"
with open(outfile, "a+") as f:
    f.write(s)
It'll append each observation to the end of outfile.txt
Saving json data to a file and reading that file
import json, urllib2

url = 'http://91.134.133.185:5000/viaroute?loc=25.299919,55.376774&loc=25.298738,55.369181'
response = urllib2.urlopen(url)
data = json.load(response)
outfile = "C:\\Users\\USER\\Desktop\\outfile.txt"

# saving json to file
with open(outfile, "w") as f:
    f.write(str(data))

# reading file with json data
with open(outfile, 'r') as g:
    json_data = g.readline()
    print json_data
#Output:
{u'route_geometry': u'{_ego#m}|rhBpBaBvHuC`EuArEUtEtAlDvEnD`MlDvMli#hsEfFzn#QlTgNhwCs#fKwBhF', u'status': 0, u'via_indices': [0, 15], u'route_summary': {u'total_time': 101, u'end_point': u'', u'start_point': u'', u'total_distance': 871}, u'route_name': [u'', u''], u'hint_data': {u'checksum': 326195011, u'locations': [u'AXQDAP____8AAAAABwAAABEAAAAYAAAAIwIAAERwAgAAAAAADgyCAef7TAMCAAEB', u'bOsDAP____8AAAAAAwAAAAcAAADFAQAAFAAAAEJwAgAAAAAANQeCAd3dTAMFAAEB']}, u'via_points': [[25.299982, 55.376873], [25.29874, 55.369179]], u'status_message': u'Found route between points', u'found_alternative': False}
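The snippets above are Python 2 (urllib2 and the print statement). A roughly equivalent Python 3 sketch, assuming the same endpoint and output path, would be:
import json
import urllib.request

url = 'http://91.134.133.185:5000/viaroute?loc=25.299919,55.376774&loc=25.298738,55.369181'
outfile = "C:\\Users\\USER\\Desktop\\outfile.txt"

# Fetch and decode the JSON response.
with urllib.request.urlopen(url) as response:
    data = json.load(response)

# Append total_time to the output file, one observation per line.
with open(outfile, "a+") as f:
    f.write(str(data['route_summary']['total_time']) + "\n")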

Python 3: Opening multiple .csv files

I want to open multiple CSV files (with the same data types/columns), load the data into one variable, do some stuff to the data, and save it into one CSV file. While I can easily open one file, I can't seem to find a way to open multiple files. Here is my code:
import numpy as np
import csv
from collections import Counter

files = ['11.csv', '12.csv', '13.csv', '14.csv', '15.csv']
with open(files) as csvfile:
    info = csv.reader(csvfile, delimiter=',')
    info_types = []
    records = 0
    for row in info:
        records = row[2]
        call_types.append(records)
stats = Counter(call_types).most_common()
print(stats)
results = stats
resultFile = open("Totals.csv", 'w')
wr = csv.writer(resultFile, dialect='excel')
for output in results:
    wr.writerow(output)
To make it work, and at the same time keep it less bug-prone and efficient, try the following.
import csv  # required imports

files = ['11.csv', '12.csv', '13.csv', '14.csv', '15.csv']
with open("outfile", "wt") as fw:
    writer = csv.writer(fw)
    for file in files:
        with open(file) as csvfile:
            info = csv.reader(csvfile, delimiter=',')
            info_types = []
            records = 0
            for row in info:
                # process row, but don't store it in any list if you
                # don't have to (that would defeat the purpose);
                # say you end up with processed_row
                writer.writerow(processed_row)
I would do this within a loop, since you are already appending the data as you read from each file:
for f in files:
    with open(f) as csvfile:
        ...
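Here is a hedged sketch of that loop approach, filled in with the counting and writing logic taken from the question (the column index 2 and the Totals.csv name come from the original code):
import csv
from collections import Counter

files = ['11.csv', '12.csv', '13.csv', '14.csv', '15.csv']
call_types = []

# Read the third column of every row in every file.
for f in files:
    with open(f, newline='') as csvfile:
        info = csv.reader(csvfile, delimiter=',')
        for row in info:
            call_types.append(row[2])

# Count the occurrences and write the totals to a single CSV.
stats = Counter(call_types).most_common()
with open("Totals.csv", 'w', newline='') as result_file:
    wr = csv.writer(result_file, dialect='excel')
    for output in stats:
        wr.writerow(output)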