Save class items as json file in python scrapy - json

I want to save all data of both these classes (Product_Items and Variant_Product) as JSON output files.
getProductDetails() : In this function I want to extract the data for just 1st element in product_variants list and ading it to the dict(item_list) and for rest of the elements I am creating a req to hit the same function recursively untill I have all the keys in my dict(item_list).
At the end of the function I want to write the extracted data as JSON file, but I can't return two values from a function.
Similarly, in getListingDetails() function I need to save the item as JSON file. PLEASE HELP!!!
Following is the snippet:
import scrapy
from scrapy.http import Request
from scrapy.selector import Selector
from scrapy.item import Item, Field
import re,json
class Product_Items(Item):
Image_URL = Field()
Product_Title = Field()
Price = Field()
PPU_Price = Field()
Product_URL = Field()
Product_SKU = Field()
Product_UPC = Field()
class Variant_Product(Item):
Image_URL = Field()
Product_Title = Field()
Price = Field()
PPU_Price = Field()
Product_URL = Field()
Product_SKU = Field()
Product_UPC = Field()
Product_Size = Field()
Meta = Field()
class walmartSpider(scrapy.Spider):
name = "walmart"
start_urls = ['https://www.walmart.com/all-departments']
item_list = {}
def parse(self,response):
reqs = []
base_url='https://www.walmart.com/'
hxs = Selector(text=response.body)
json_response = hxs.xpath('//script[#id="home"]//text()').get()
data = json.loads(json_response)
cat_urls = self.getCategoryUrls(data)
for url in cat_urls:
if url[:7] == '/browse':
url = base_url + url
link=Request(url=url,callback=self.getListingDetails)
reqs.append(link)
return reqs
def getCategoryUrls(self,data):
.....
return final_cat_url
def getListingDetails(self,response):
reqs = []
hxs = Selector(text=response)
data = json.loads(hxs.xpath('//script[#id="searchContent"]//text()').get())
products = data['searchContent']['preso']['items']
item = Product_Items()
for product in products:
item['Image_URL'] = product['imageUrl']
item['Product_Title'] = product['title']
item['Product_URL'] = base_url + product['productPageUrl']
item['Product_SKU'] = product['productId']
item['Product_UPC'] = product['standardUpc'][0]
try:
item['PPU_Price'] = product['primaryOffer']['unitPriceDisplayCondition']
except:
item['PPU_Price'] = ''
try:
regular_price = product['primaryOffer']['offerPrice']
except:
regular_price = ''
if regular_price:
item['Price'] = product['primaryOffer']['offerPrice']
else:
product_req = Request(url=item['Product_URL'],callback=self.getProductDetails)
reqs.append(product_req)
**Want to save this item as JSON file**
**#Pagination**
try:
next_page = data['searchContent']['preso']['pagination']['next']['url']
except:
next_page = ''
if next_page:
next_page_url = str(re.findall(r'^[\S]+\?',response.url)[0])+str(next_page)
req = Request(url=next_page_url,callback=self.getListingDetails)
reqs.append(req)
return reqs
def getProductDetails(self,response):
reqs = []
base_url = 'https://www.walmart.com/ip/'
hxs = Selector(text=response)
variant = Variant_Product()
prod_data = json.loads(hxs.xpath('//script[#id="item"]//text()').get())
product_variants = prod_data['item']['product']['buyBox']['products']
for product_variant in product_variants[1:]:
item_id = product_variant['usItemId']
if item_id not in self.item_list.keys():
self.item_list[item_id] = ''
req = Request(url=base_url+str(item_id),callback=self.getProductDetails)
reqs.append(req)
product_0 = prod_data['item']['product']['buyBox']['products'][0]
variant['Product_Title'] = product_0['productName']
variant['Product_SKU'] = product_0['walmartItemNumber']
variant['Product_UPC'] = product_0['upc']
variant['Product_Size'] = product_0['variants'][0]['value']
variant['Product_URL'] = product_0['canonicalUrl ']
variant['Price'] = product_0['priceMap']['price']
variant['PPU_Price'] = product_0['priceMap']['unitPriceDisplayValue']
variant['Meta'] = (product_0['categoryPath']).replace('Home Page/','')
**Want to save this item as JSON file**
return reqs

According to the scrapy docs, there are several built in "Exporters" that can serialize your data into several different formats (including JSON).
You should be able to do something like:
# ...
from scrapy.exporters import JsonItemExporter
# ...
def getListingDetails(self, response):
# ...
for product in products:
item = Product_Items(
Image_URL = product['imageUrl'],
Product_Title = product['title'],
Product_URL = base_url + product['productPageUrl'],
Product_SKU = product['productId'],
Product_UPC = product['standardUpc'][0],
PPU_Price = product.get('primaryOffer', {}).get('unitPriceDisplayCondition', ''),
Price = product.get('primaryOffer', {}).get('offerPrice', '')
)
if not item['Price']:
product_req = Request(url=item['Product_URL'],callback=self.getProductDetails)
reqs.append(product_req)
JsonItemExporter(open(f"{item['Product_SKU']}.json", "wb")).export_item(item)
Some notes:
The JsonItemExporter.__init__ method expects a file-like object whose write method accepts bytes, which is why the "wb"
dict.get() in Python allows you to specify a default value as the second argument, in case a key doesn't exist (not strictly necessary here, but reduces the try/except logic)
When handling exceptions, it's recommended by PEP8 standards to catch more specific exception types (in the above cases, except KeyError: might be appropriate) than just a bare except clause
Please let me know if the above works for you!

Related

Access/Query List of Object SQLAlchemy

I have a relationship that yields a list of objects
class Category(db.Model):
_id = db.Column("id", db.Integer, primary_key = True)
book_category = db.Column("book_category", db.String)
booklist = db.relationship('Books', backref = "book_category")
def __init__(self, book_category):
self.book_category = book_category
class Books(db.Model):
_id = db.Column("id", db.Integer, primary_key = True)
bookname = db.Column("bookname", db.String)
filename = db.Column("filename", db.String)
category = db.Column(db.Integer, db.ForeignKey('category.book_category'))
def __init__(self, bookname,filename):
self.bookname = bookname
self.filename = filename
So when I'm querying the category I have an access to category.booklist
Which would yield something like this
[<book1>, <book3>, <book4>]
I can access each using for loop however, I'm feeling that it is not the most efficient way to do.
Is there any way that I can do like
category.booklist.query.filter(books.bookname == variable).all()
Querying and filtering the list of objects yielded by category.booklist

how to show manytomany field data in json format - django

I'm trying to show manytomany data in json format(without using serializer), here is my models.py
class CustomerInvoice(models.Model):
customer = models.CharField(max_length=50)
items_model = models.ManyToManyField(Item,through='InvoiceItem')
created_at = models.DateTimeField(auto_now_add=True)
class InvoiceItem(models.Model):
item = models.ForeignKey(Item,on_delete=models.CASCADE)
invoice = models.ForeignKey(CustomerInvoice,on_delete=models.CASCADE,related_name='invoice')
quantity = models.IntegerField()
price = models.DecimalField(max_digits=20,decimal_places=2)
is it possible to make a look up base on many to many data?
something like this : Q(items_model__icontains=query_search) ,and also how to return the M2M data into a json format using values() and json.dumps please? this returns the ID Values('items_model') and this dont work Values('items_model__all')
and here is my views.py
def invoices_all_lists(request):
if request.is_ajax():
query_search = request.GET.get('filter')
if query_search:
all_item_qs = CustomerInvoice.objects.all()
a = []
for i in all_item_qs.items_model.all():
a.append(i.item.name)
invoices = CustomerInvoice.objects.annotate(
total=Sum((F('invoice__quantity') * F('invoice__price')),output_field=DecimalField(decimal_places=2,max_digits=20))
).filter(
Q(id__icontains=query_search) | Q(seller__username__icontains=query_search) |
Q(customer__icontains=query_search)).values(
'id','seller__username','customer','total','created_at','items_model').order_by('-id')
else:
all_item_qs = CustomerInvoice.objects.all()
a = []
for data in all_item_qs:
for i in data.items_model.all():
a.append(i.item.name)
invoices = CustomerInvoice.objects.annotate(
total=Sum((F('invoice__quantity') * F('invoice__price')) ,output_field=DecimalField(decimal_places=2,max_digits=20))
).values(
'id','seller__username','customer','total','created_at','items_model').order_by('-id')
start_from = 0
if request.GET.get('start'):
start_from = int(request.GET.get('start'))
limit = 10
if request.GET.get('limit'):
limit = int(request.GET.get('limit'))
data_lists = []
for index,value in enumerate(invoices[start_from:start_from+limit],start_from):
value['counter'] = index+1
data_lists.append(value)
data = {
'objects':data_lists,
'length':invoices.count(),
}
return HttpResponse(json.dumps(data, indent=4, sort_keys=True, default=str),'application/json')
else:
return redirect('invoiceapp:list-all-invoice')
can i add this part of the code into the query please?
a = []
for data in all_item_qs:
for i in data.items_model.all():
a.append(i.item.name)
note : i've used datatable server side in the client side

Generate n-gram for a specific column present in mysql db

I'm writing a code to generate n-grams for every record in the table by reading a specific column.
def extract_from_db(inp_cust_id):
sql_db = TatDBHelper()
t_sql = "select notes from raw_data where customer_id = {0}"
db_data = sql_db.execute_read(t_sql.format(inp_cust_id))
for row in db_data:
text = row.values()
bi_grams = generate_ngrams(text[0].encode("utf-8"), 2)
print bi_grams
def generate_ngrams(sentence, n):
sentence = sentence.lower()
sentence = re.sub(r'[^a-zA-Z0-9\s]', ' ', sentence)
tokens = [token for token in sentence.split(" ") if token != ""]
ngrams = zip(*[tokens[i:] for i in range(n)])
return [" ".join(ngram) for ngram in ngrams]
I'm getting the output like:
['i highly', 'highly recommend', 'recommend it']
['the penguin', 'penguin encounter', 'encounter was', 'was awesome']
I want the output to look like below, can anybody help me to get this.
['i highly',
'highly recommend',
'recommend it',
...
]
creat another list all_ngrams, and keep appending the values to it , using .extend(), and finally you will have all the ngrams in one list.
Try this :
def extract_from_db(inp_cust_id):
sql_db = TatDBHelper()
t_sql = "select notes from raw_data where customer_id = {0}"
db_data = sql_db.execute_read(t_sql.format(inp_cust_id))
all_ngrams = []
for row in db_data:
text = row.values()
bi_grams = generate_ngrams(text[0].encode("utf-8"), 2)
all_ngrams.extend(bi_grams)
print all_ngrams

Selenium python script working but its not click or entering any value firefox

Script working completely but its not entering any data.
Here my code:
from selenium import webdriver
from selenium.webdriver.support.ui import Select
import datetime
from login_credentials import *
from common_file import *
from selenium.webdriver.firefox.options import Options
from pyvirtualdisplay import Display
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
start_time = str(sdate)+" "+ str(stime)
end_time = str(edate)+" "+ str(etime)
options = Options()
options.headless = True
driver = webdriver.Firefox(executable_path='/usr/bin/geckodriver',options=options)
driver.get("https://www.goeventz.com/")
driver.find_element_by_xpath("//a[contains(text(),'Login')]").click()
print("going")
#driver.find_element_by_id("userlogin")
driver.find_element_by_id("user_email").send_keys(ge_email)
driver.find_element_by_id("password").send_keys(ge_pswd)
#driver.find_elements_by_class_name(".btn-login").click()
#driver.find_element_by_css_selector('btn-login').click()
driver.find_element_by_xpath("//button[#type='submit']").click()
driver.find_element_by_xpath("//a[contains(text(),'Create Event') and #id='headerbtn']").click()
driver.find_element_by_name("title").clear()
driver.find_element_by_name("title").send_keys(eventname)
driver.find_element_by_xpath("//a[contains(text(),'Enter Address')]").click()
driver.find_element_by_xpath("//input[contains(#name,'venue_name')]").send_keys(full_address)
driver.find_element_by_name("start_date_time").clear()
driver.find_element_by_name("start_date_time").send_keys(start_time)
driver.find_element_by_name("end_date_time").clear()
driver.find_element_by_name("end_date_time").send_keys(end_time)
driver.find_element_by_id("fileToUpload").send_keys("/var/www/html/crons/event_posting/manual/test.jpg")
driver.find_element_by_xpath("//div[contains(#class,'fr-element fr-view')]").send_keys('description')
select = Select(driver.find_element_by_name("booknow_button_value"))
select.select_by_value('Register')
select = Select(driver.find_element_by_name("category"))
select.select_by_value("Sports")
select = Select(driver.find_element_by_name("othercategory"))
select.select_by_value('Festival')
driver.find_element_by_name("support_mobile").send_keys(cont_number)
driver.find_element_by_name('support_email').send_keys(email_id)
driver.find_element_by_name("makeeventlive").click()
print("its complted")
and it running completly on server, this is output:
but its not entering any data as provided it just output it blank.
here the output im getting on browser:
output on browser
this is common_file:
from dbconnection import get_conn
from datetime import datetime
connection_object, cursor = get_conn()
json_0 = []
json12_in_list = []
json_12 = []
json34_in_list = []
json_34 = []
json5 = []
json678_in_list = []
json_678 = []
json9 = []
json10 = []
main_json = {}
event_details = ''
with open('event_details.txt', 'r') as f:
event_details = f.read()
event_id = int(event_details.split(',')[0])
site_id = int(event_details.split(',')[1])
site_name = str(event_details.split(',')[2])
#event_id =
sql = """SELECT * FROM articles2 WHERE id ='%d'""" %event_id
cursor.execute(sql)
data = cursor.fetchall()
for info in data:
eventid = info[0]
countryname = info[1]
eventname = info[2]
profileimg = info[5]
banner0 = info[6]
sdate = str(info[7])[:10]
edate = str(info[8])[:10]
addr1 = info[9]
addr2 = info[10]
pincode = info[11]
full_address = info[15]
state = info[12]
city = info[13]
stime = str(info[18])
#s_time = datetime.strptime(stime,"%H:%M:%S")
#stime = s_time.strftime("%I:%M:%S %p")
etime = str(info[19])
# e_time = datetime.strptime(etime,"%H:%M:%S")
# etime = e_time.strftime("%I:%M:%S %p")
description = info[20]
src_url = info[26]
json0 = {"event id":eventid, "country":countryname, "event name":eventname, "profile image":profileimg, "banner":banner0, "start date":sdate,
"end date":edate, "address 1":addr1, "address 2":addr2, "pincode":pincode, "full address":full_address, "state":state, "city":city,
"start time":stime, "end time":etime, "description":description, "source url":src_url}
json_0.append(json0)
main_json['event info'] = json_0
#tickets
sql1 = """SELECT * FROM tickets WHERE event_id = '%d'""" %event_id
cursor.execute(sql1)
data1 = cursor.fetchall()
for info1 in data1:
tktid = info1[0]
eventid1 = info1[1]
tktname = info1[2]
original_tkt_price = info1[3]
other_charges = info1[4]
other_charges_type = info1[5]
tkt_qty = info1[6]
min_qty = info1[7]
max_qty = info1[8]
qty_left = info1[9]
ticket_msg = info1[10]
ticket_start_date = str(info1[11])[:10]
ticket_start_time = str(info1[11])[11:]
expiry_date = str(info1[12])[:10]
expiry_time = str(info1[12])[11:]
ticket_label= info1[13]
active1 = info1[14]
..........................................................................

Why 2 queries are executed instead of one?

I have following piece of code:
def detail(request, popular_id):
try:
popular = Popular.objects.get(pk = popular_id)
share = Share.objects.get(isin = popular.isin) #LINE 1
chart_data_json = share.get_chart_data_json()
except Popular.DoesNotExist:
raise Http404
return render(request, 'popular/detail.html', {'popular': popular, 'chart_data': chart_data_json})
In LINE 1 I noticed using debug-toolbar that there are two queries get executed:
SELECT `share_share`.`id`, `share_share`.`symbol`, `share_share`.`isin`, `share_share`.`name`, `share_share`.`market`, `share_share`.`updated` FROM `share_share` WHERE `share_share`.`id` = 1
and
SELECT `share_share`.`id`, `share_share`.`symbol`, `share_share`.`isin`, `share_share`.`name`, `share_share`.`market`, `share_share`.`updated` FROM `share_share` WHERE `share_share`.`isin` = 'US5949181045'
I cannot understand why we need the first query and how to avoid it?
EDIT:
Model definition of share:
class Share(models.Model):
symbol = models.CharField(max_length = 32)
isin = models.CharField(max_length = 12)
name = models.CharField(max_length = 256)
market = models.CharField(max_length = 64)
updated = models.BooleanField(default = False)
def get_chart_data_json(self):
quote_model = create_quote_model(str(self.isin))
data = quote_model.objects.values('date', 'adj_close', 'volume')
chart_data = []
for d in data.iterator():
chart_data.append({'date': d['date'].isoformat(), 'value': d['adj_close'], 'volume': d['volume']})
chart_data_json = json.dumps(chart_data)
return chart_data_json
def __unicode__(self):
return self.isin
Model definition of popular:
class Popular(models.Model):
title = models.CharField(max_length = 120)
text = models.CharField(max_length = 1024)
isin = models.ForeignKey(Share)
def __unicode__(self):
return self.title
First query is evaluated when you access foreign key isin from popular object:
share = Share.objects.get(isin = popular.isin)
Second query gets Share object:
share = Share.objects.get(isin = popular.isin)
If you want just one query at #LINE 1 you should replace it with:
share = popular.isin #LINE 1