I am trying to fetch public holiday data for France, Australia and Germany from https://date.nager.at/ (documentation at https://date.nager.at/swagger/index.html) and store it in a JSON file.
So far I have created an endpoint /fetch_and_save with its own URL, but I have only managed to write dummy data in the views.py file. How do I fetch the data above and store it in a database through Django?
Edit: I have been asked to show the code.
I ran
python3 manage.py startapp fetch_and_save
to create the fetching application, and I have the following URL patterns in storefront/urls.py:
urlpatterns = [
path('admin/', admin.site.urls),
path('fetch_and_save/', include('fetch_and_save.urls')),
]
then I added a function in fetch_and_save/views.py:
from django.http import HttpResponse

def say_hello(request):
    return HttpResponse('Fetch the data from date.nager.at here')
and I called that in fetch_and_save/urls.py:
urlpatterns = [
path('', views.say_hello)
]
That's it. It's just a hello world app.
Here are some clues on how to do this in a simple way:
fetch_and_save/views.py
import json
import requests

def get_public_holidays(request, year, tag):
    response = requests.get(f'https://date.nager.at/api/v2/PublicHolidays/{year}/{tag}')
    with open(f'public_holidays_{tag}_{year}.json', 'w') as outfile:
        # response.json() parses the body, so the file contains the actual JSON
        # rather than a double-encoded string
        json.dump(response.json(), outfile)
    ...
fetch_and_save/urls.py
urlpatterns = [
path('<int:year>/<str:tag>/', views.get_public_holidays)
]
Then if you visit e.g. /fetch_and_save/2020/DE you will fetch the 2020 public holidays of Germany and save them to a JSON file named public_holidays_DE_2020.json.
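If you also want to store the data in a database through Django, as the question asks, here is a minimal sketch of that part. The Holiday model is hypothetical (it is not in the original post); the keys date, localName, name and countryCode are the fields documented in the Nager.Date response.
fetch_and_save/models.py
from django.db import models

class Holiday(models.Model):
    date = models.DateField()
    local_name = models.CharField(max_length=200)
    name = models.CharField(max_length=200)
    country_code = models.CharField(max_length=2)
fetch_and_save/views.py
import requests
from django.http import JsonResponse
from .models import Holiday

def fetch_and_store(request, year, tag):
    # fetch the holidays and upsert one row per holiday
    response = requests.get(f'https://date.nager.at/api/v2/PublicHolidays/{year}/{tag}')
    holidays = response.json()
    for h in holidays:
        Holiday.objects.update_or_create(
            date=h['date'],
            country_code=h['countryCode'],
            defaults={'local_name': h['localName'], 'name': h['name']},
        )
    return JsonResponse({'saved': len(holidays)})
Remember to run python3 manage.py makemigrations fetch_and_save and python3 manage.py migrate before calling the view.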
Related
I have experienced a problem creating my own page in Django. I followed the tutorial but got different results. The error is "Page not found", with: Using the URLconf defined in djangonautic.urls, Django tried these URL patterns, in this order:
admin/
about/
^$
The empty path didn't match any of these. I would appreciate it if someone could help me:
urls.py
from django.contrib import admin
from django.urls import path
from. import views
urlpatterns = [
path(r'^admin/', admin.site.urls),
path(r'^about/$', views.about),
path(r'^$', views.homepage),
path(r'^$', views.index),
]
views.py
from django.http import HttpResponse
from django.shortcuts import render
def about(request):
    return HttpResponse('my name is Jacky')

def homepage(request):
    return HttpResponse('welcome home')

def index(request):
    return HttpResponse("Hello, world I am the king")
I expect the web page to display normally, with no 404 error.
First, the import in urls.py (in your app directory) should be:
from . import views
Secondly, you do not need to specify a URL path for the admin page in your app's urls.py; remove it from there, since the admin route is already handled by the project-level urls.py.
Third, ensure that your projectname/urls.py includes your app's urls.py:
from django.contrib import admin
from django.urls import path, include
urlpatterns = [
path('admin/', admin.site.urls),
path('', include('inventory.urls')),
]
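For completeness, the app-level urls.py would then look roughly like this (a sketch assuming the app is named inventory, as in the include() above). Note that only one view can be matched for the empty path, so the duplicate index route from the question is dropped here.
inventory/urls.py
from django.urls import path
from . import views

urlpatterns = [
    path('about/', views.about),
    path('', views.homepage),
]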
I have been tasked with creating a method to download multiple PDFs from URLs included in JSON files. There is probably 1 URL per JSON file, with approximately 500k JSON files to process in any one batch.
Here's a sample of the JSON file:
{
"from": null,
"id": "sfm_c4kjatol7u8psvqfati0",
"imb_code": "897714123456789",
"mail_date": null,
"mail_type": "usps_first_class",
"object": "self_mailer",
"press_proof": "https://lob-assets.com/sid-self_mailers/sfm_c4kjatol7u8psvqfati0.pdf?version=v1&expires=1635274615&signature=AZlb0MSzZPuCjtKFkXRr_OoHzDzEy23UqzmKFWs5bycKCEcIyfe2od58zHzfP1a-iW5d9azFYUT1PnosqKcvBg",
"size": "11x9_bifold",
"target_delivery_date": null,
"to": {
"address_city": "SAN FRANCISCO",
"address_country": "UNITED STATES",
"address_line1": "185 BERRY ST STE 6100",
"address_line2": null,
"address_state": "CA",
"address_zip": "94107-1741",
"company": "Name.COM",
"name": "EMILE ILES"
}
}
Each JSON file is converted to CSV and the URL in it is downloaded.
Here's what I have been trying to use, but it is not working. What am I missing?
Import urllib.request, json, requests, os, csvkit
from itertools import islice
from pathlib import Path
path = Path("/Users/MyComputer/Desktop/self_mailers")
paths = [i.path for i in islice(os.scandir(path), 100)]
in2csv data.json > data.csv
with open('*.json', 'r') as f:
urls_dict = json.load(f)
urls_dict = urls_dict[0]
itr = iter(urls_dict)
len(list(itr))
f.write(r.pdf)
Why are you converting your JSON to a CSV?
By the way, if you are unsure where the URLs are in the JSON files, I would do this:
import os
import json
from rethreader import Rethreader
from urllib.parse import urlparse
from urllib.request import urlretrieve
def download_pdf(url):
    # use urlparse to find the pdf name
    filename = urlparse(url).path.rsplit('/')[-1]
    urlretrieve(url, filename)

# use multi-threading for faster downloads
downloader = Rethreader(download_pdf).start()

def verify_url(value):
    if not isinstance(value, str):
        # if the value is not a string, it cannot be a URL
        return False
    try:
        parsed_url = urlparse(value)
    except AttributeError:
        # the value cannot be parsed as a URL
        return False
    if not (parsed_url.scheme and parsed_url.netloc and parsed_url.path):
        # the value is not a URL because it lacks a scheme, host or path
        return False
    return True

def parse_data(data):
    for value in data.values():
        if verify_url(value):
            downloader.add(value)

for file in os.listdir():
    with open(file) as fp:
        try:
            json_data = json.load(fp)
        except (json.JSONDecodeError, UnicodeDecodeError):
            # this file is not JSON; skip to the next one
            continue
        parse_data(json_data)

# quit the downloader after downloading the files
downloader.quit()
If you know which keys the URLs can be under, I would do this:
# The other parts same as before
def parse_data(data):
    for key in ['possible_key', 'another_possible_key']:
        if key in data and verify_url(data[key]):
            downloader.add(data[key])
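If you would rather not depend on the third-party rethreader package, a rough equivalent using only the standard library's concurrent.futures might look like this (a sketch, not tested against your data):
import os
import json
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlparse
from urllib.request import urlretrieve

def download_pdf(url):
    # derive the file name from the last path segment of the URL
    filename = urlparse(url).path.rsplit('/', 1)[-1]
    urlretrieve(url, filename)

urls = []
for file in os.listdir():
    with open(file) as fp:
        try:
            data = json.load(fp)
        except (json.JSONDecodeError, UnicodeDecodeError):
            continue  # not a JSON file
    # keep any string value that looks like a URL
    urls.extend(v for v in data.values() if isinstance(v, str) and v.startswith('http'))

# download in parallel with a bounded thread pool
with ThreadPoolExecutor(max_workers=16) as pool:
    list(pool.map(download_pdf, urls))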
I have 2 JSON configuration files to read and want to assign their values to variables. I am creating a Dataflow job using Apache Beam but am unable to parse those files and assign their values to variables.
config1.json - { "bucket_name": "mybucket"}
config2.json - { "dataset_name": "mydataset"}
These are the pipeline statements. I tried with one JSON file first, but even that is not working:
with beam.Pipeline(options=pipeline_options) as pipeline:
    steps = (pipeline
             | "Getdata" >> beam.io.ReadFromText(custom_options.configfile)
             | "CUSTOM JSON PARSE" >> beam.ParDo(custom_json_parser(custom_options.configfile))
             | "write to GCS" >> beam.io.WriteToText('gs://mynewbucket/outputfile.txt')
             )
    result = pipeline.run()
    result.wait_until_finish()
I also tried creating a function to parse at least one file. This is a sample method I created, but it did not work:
class custom_json_parser(beam.DoFn):
    import apache_beam as beam
    from apache_beam.io.gcp import gcsio
    import logging

    def __init__(self, configfile):
        self.configfile = configfile

    def process(self, configfile):
        logging.info("JSON PARSING STARTED")
        with beam.io.gcp.gcsio.GcsIO().open(self.configfile, 'r') as f:
            for line in f:
                data = json.loads(line)
                bucket = data.get('bucket_name')
                dataset = data.get('dataset_name')
Can someone please suggest the best method to resolve this issue in Apache Beam?
Thanks in advance.
If you only need to read your files once, don't read them in the pipeline; read them before running it:
Read the files from GCS
Parse the file and put the useful content in the pipeline options map
Run your pipeline and use the data from the options
EDIT 1
You can use this piece of code to load and read the file before your pipeline runs. It's plain Python with the standard GCS client library.
from google.cloud import storage
import json
client = storage.Client()
bucket = client.get_bucket('your-bucket')
blob = bucket.get_blob("name.json")
json_data = blob.download_as_string().decode('UTF-8')
print(json_data) # print -> {"name": "works!!"}
print(json.loads(json_data)["name"]) # print -> works!!
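To complete the idea, once the JSON is parsed you can use the values as plain Python variables when building the pipeline (or stash them in your custom pipeline options). A rough sketch, reusing json_data from the snippet above; the Create/WriteToText steps are just placeholders and not from the original post:
import json
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

config = json.loads(json_data)  # json_data was downloaded from GCS above
bucket_name = config.get('bucket_name')
dataset_name = config.get('dataset_name')

pipeline_options = PipelineOptions()
with beam.Pipeline(options=pipeline_options) as pipeline:
    # use the parsed values directly while constructing the graph
    (pipeline
     | "Create" >> beam.Create([f"bucket={bucket_name}, dataset={dataset_name}"])
     | "write to GCS" >> beam.io.WriteToText(f'gs://{bucket_name}/outputfile.txt'))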
You can try the following code snippet.
Function to parse the file:
import json
import logging
import apache_beam as beam

class custom_json_parser(beam.DoFn):
    def process(self, element):
        logging.info(element)
        data = json.loads(element)
        bucket = data.get('bucket_name')
        dataset = data.get('dataset_name')
        return [{"bucket": bucket, "dataset": dataset}]
Then in the pipeline you can call the function:
with beam.Pipeline(options=pipeline_options) as pipeline:
    steps = (pipeline
             | "Getdata" >> beam.io.ReadFromText(custom_options.configfile)
             | "CUSTOM JSON PARSE" >> beam.ParDo(custom_json_parser())
             | "write to GCS" >> beam.io.WriteToText('gs://mynewbucket/outputfile.txt')
             )
    result = pipeline.run()
    result.wait_until_finish()
It will work.
I am new to Python and Django. I am an IT professional who deploys software that monitors computers. The API outputs JSON. I want to create a Django app that reads the API and outputs the data to an HTML page. Where do I get started? I think the idea is to write the JSON feed to a Django model. Any help/advice is greatly appreciated.
Here's a simple single-file script to extract the JSON data:
import urllib2
import json

def printResults(data):
    theJSON = json.loads(data)
    for i in theJSON[""]:
        print i  # process each item here

def main():
    urlData = ""
    webUrl = urllib2.urlopen(urlData)
    if (webUrl.getcode() == 200):
        data = webUrl.read()
        printResults(data)
    else:
        print "Received error"

if __name__ == '__main__':
    main()
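Note that urllib2 and the print statements above are Python 2 only. A rough Python 3 equivalent using urllib.request (same structure; urlData is still your API URL to fill in) might be:
import urllib.request
import json

def print_results(data):
    the_json = json.loads(data)
    print(the_json)  # drill into the keys your API returns here

def main():
    urlData = ""  # your API URL
    with urllib.request.urlopen(urlData) as web_url:
        if web_url.getcode() == 200:
            print_results(web_url.read())
        else:
            print("Received error")

if __name__ == '__main__':
    main()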
If you have a URL that returns JSON as the response, you could try this:
import requests
import json
url = 'http://....' # Your api url
response = requests.get(url)
json_response = response.json()
Now json_response is a list containing dicts. Let's suppose you have this structure:
[
    {
        'code': 'ABC',
        'avg': 14.5,
        'max': 30
    },
    {
        'code': 'XYZ',
        'avg': 11.6,
        'max': 21
    },
    ...
]
You can iterate over the list and save each dict into a model.
from yourmodels import CurrentModel
...
for obj in json_response:
    cm = CurrentModel()
    cm.avg = obj['avg']
    cm.max = obj['max']
    cm.code = obj['code']
    cm.save()
Or you could use a bulk method, but keep in mind that bulk_create() does not call the model's save() method.
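A sketch of the bulk variant, using the same hypothetical CurrentModel fields as above:
from yourmodels import CurrentModel

objs = [
    CurrentModel(avg=obj['avg'], max=obj['max'], code=obj['code'])
    for obj in json_response
]
# one INSERT per batch instead of one query per row; save() is not called
CurrentModel.objects.bulk_create(objs)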
I'm making a pipeline in Scrapy to store scraped data in a MySQL database. When the spider is run in the terminal it works perfectly, and the pipeline is even opened. However, the data is not being sent to the database. Any help appreciated! :)
here's the pipeline code:
import sys
import MySQLdb
import hashlib
from scrapy.exceptions import DropItem
from scrapy.http import Request
from tutorial.items import TutorialItem

class MySQLTest(object):
    def __init__(self):
        db = MySQLdb.connect(user='root', passwd='', host='localhost', db='python')
        cursor = db.cursor()

    def process_item(self, spider, item):
        try:
            cursor.execute("INSERT INTO info (venue, datez) VALUES (%s, %s)", (item['artist'], item['date']))
            self.conn.commit()
        except MySQLdb.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
        return item
and here's the spider code:
import scrapy # Import required libraries.
from scrapy.selector import HtmlXPathSelector # Allows for path detection in a website's code.
from scrapy.spider import BaseSpider # Used to create a simple spider to extract data.
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor # Needed for the extraction of href links in HTML to crawl further pages.
from scrapy.contrib.spiders import CrawlSpider # Needed to make the crawl spider.
from scrapy.contrib.spiders import Rule # Allows specified rules to affect what links are crawled.
import spotipy
import soundcloud
import mysql.connector
from tutorial.items import TutorialItem

class AllGigsSpider(CrawlSpider):
    name = "allGigs" # Name of the spider. In the command prompt, when in the correct folder, enter "scrapy crawl allGigs".
    allowed_domains = ["www.allgigs.co.uk"] # Allowed domains is a string, NOT a URL.
    start_urls = [
        "http://www.allgigs.co.uk/whats_on/London/clubbing-1.html",
        "http://www.allgigs.co.uk/whats_on/London/festivals-1.html",
        "http://www.allgigs.co.uk/whats_on/London/comedy-1.html",
        "http://www.allgigs.co.uk/whats_on/London/theatre_and_opera-1.html",
        "http://www.allgigs.co.uk/whats_on/London/dance_and_ballet-1.html"
    ] # Specify the starting points for the web crawler.
    rules = [
        Rule(SgmlLinkExtractor(restrict_xpaths='//div[@class="more"]'), # Search the start URLs for links to follow.
             callback="parse_me",
             follow=True),
    ]

    def parse_me(self, response):
        for info in response.xpath('//div[@class="entry vevent"]|//div[@class="resultbox"]'):
            item = TutorialItem() # Extract items from the items folder.
            item['artist'] = info.xpath('.//span[@class="summary"]//text()').extract() # Extract artist information.
            item['date'] = info.xpath('.//span[@class="dates"]//text()').extract() # Extract date information.
            #item['endDate'] = info.xpath('.//abbr[@class="dtend"]//text()').extract() # Extract end date information.
            #item['startDate'] = info.xpath('.//abbr[@class="dtstart"]//text()').extract() # Extract start date information.
            item['genre'] = info.xpath('.//div[@class="header"]//text()').extract()
            yield item # Retrieve items in item.
            client = soundcloud.Client(client_id='401c04a7271e93baee8633483510e263')
            tracks = client.get('/tracks', limit=1, license='cc-by-sa', q=item['artist'])
            for track in tracks:
                print(tracks)
I believe the problem was in my settings.py file, where I had missed a comma... yawn.
ITEM_PIPELINES = {
'tutorial.pipelines.MySQLTest': 300,
}
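For reference, the pipeline class as posted also references cursor and self.conn, which are never stored on the instance, and Scrapy calls process_item(self, item, spider) with the item first. A minimal corrected sketch in Python 3 syntax, keeping the same table and fields as in the question:
import MySQLdb

class MySQLTest(object):
    def __init__(self):
        # keep the connection and cursor on self so process_item can use them
        self.conn = MySQLdb.connect(user='root', passwd='', host='localhost', db='python')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        try:
            self.cursor.execute(
                "INSERT INTO info (venue, datez) VALUES (%s, %s)",
                (item['artist'], item['date']))
            self.conn.commit()
        except MySQLdb.Error as e:
            print("Error %d: %s" % (e.args[0], e.args[1]))
        return item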