I have a script that is working fine except for this tiny issue. My script is looping over list items and appending a json string over a loop and then doing json dump to file.
My json string:
main_json = {"customer": {"main_address": "","billing_address": "","invoice_reference": "","product": []}}
main loop:
for row in result:
account_id = ACCOUNTID_DATA_CACHE.get(row['customer.main_address.customer_id'])
if account_id is None or account_id != row['customer.main_address.customer_id']:
if main_json:
results.append(main_json)
main_json = {"customer": {"main_address": "","billing_address": "","invoice_reference": "","product": []}}
main_address = {}
billing_address = {}
for key,value in row.items():
if key.startswith('customer.main_address'):
main_address[key.split(".")[2]] = value
if key.startswith('customer.billing_address'):
billing_address[key.split(".")[2]] = value
billing_address_copy = billing_address.copy()
for mkey,mvalue in main_address.items():
for bkey,bvalue in billing_address_copy.items():
if str(bvalue) == str(mvalue):
bvalue = ''
billing_address_copy[bkey] = bvalue
if all(value == '' for value in billing_address_copy.values()) is True:
main_json['customer']['billing_address'] = ''
else:
main_json['customer']['billing_address'] = billing_address
main_json['customer']['main_address'] = main_address
product = parse_products(row)
main_json['customer']['product'].append(product)
...
def parse_products(row):
product = {}
x = {}
for key,value in row.items():
if key.startswith('customer.product'):
product[key.split(".")[2]] = value
if key.startswith('customer.product.custom_attributes'):
x['domain'] = value
print(x)
product[key.split(".")[2]] = x
if key == 'start_date' or 'renewal_date':
value = str(value)
product[key] = value
return product
In this part below, how do make sure that the value is not a string when dumped?
if key.startswith('customer.product.custom_attributes'):
x['domain'] = value
print(x)
product[key.split(".")[2]] = x
Because in the output I'm getting:
{
"custom_attributes": "{'domain': 'somedomain.com'}",
"description": "some_description",
"discount": "0.00"}
When what I really want is:
{
"custom_attributes": {"domain": "somedomain.com"},
"description": "some_description",
"discount": "0.00"}
EDIT: how i'm dumping:
with open('out.json', 'w') as jsonout:
json.dump(main_json, jsonout, sort_keys=True, indent=4)
Well, this IF is flawed and always TRUE:
if key == 'start_date' or 'renewal_date':
So you are converting everything to str()
Related
How to delete empty lists and dictionaries in json row and count them, for example:
[ [ { }, [ ], " ", { {"1":{}, "2":[ ] }, [ [ { } ] , " " ] ] ]
This uses an object to handle the recognition & counting of empty things:
class Counts:
d = 0
l = 0
def count(self,x):
if x==[]:
self.l += 1
return True
elif x=={}:
self.d += 1
return True
return False
def __repr__(self):
return "<d=%d, l=%d>" % (self.d,self.l)
def tr(x):
c = Counts()
new_x = tr2(x,c)
c.count(new_x) # Count final result
return new_x,c.d,c.l
def tr2(x,c):
if isinstance(x,dict):
new_x = {}
for k,v in x.items():
if not c.count(v):
new_v = tr2(v,c)
if not c.count(new_v):
new_x[k] = new_v
return new_x
elif isinstance(x,list):
new_x = []
for v in x:
if not c.count(v):
new_v = tr2(v,c)
if not c.count(new_v):
new_x.append( new_v )
return new_x
return x
So what I am trying to do here is for a given json_body which is decoded json into a table using cjson I want to remove a given element by a configurable value conf.remove.json, I feel I am pretty close but its still not working, and is there a better way? Is there a safe way to find the tables "depth" and then reach out like conf.remove.json= I.want.to.remove.this creates the behavior json_table[I][want][to][remove][this] = nil without throwing some kind of NPE?
local configRemovePath= {}
local configRemoveDepth= 0
local recursiveCounter = 1
local function splitString(inputstr)
sep = "%." --Split on .
configRemovePath={}
configRemoveDepth=0
for str in string.gmatch(inputstr, "([^"..sep.."]+)") do
configRemovePath[configRemoveDepth + 1] = str
configRemoveDepth = configRemoveDepth + 1
end
end
local function recursiveSearchAndNullify(jsonTable)
for key, value in pairs(jsonTable) do --unordered search
-- First iteration
--Sample Json below, where conf.remove.json = data.id and nothing happened. conf.remove.json=data.id
--{
--"data": {
-- "d": 2,
-- "id": 1
--}
--}
-- value = {"d": 2, "id": 1}, key = "data", configRemovePath[recursiveCounter] = "data" , configRemovePath ['data','id'] , configRemoveDepth = 2
if(type(value) == "table" and value == configRemovePath[recursiveCounter] and recursiveCounter < configRemoveDepth) then --If the type is table, the current table is one we need to dive into, and we have not exceeded the configurations remove depth level
recursiveCounter = recursiveCounter + 1
jsonTable = recursiveSearchAndNullify(value)
else
if(key == configRemovePath[recursiveCounter] and recursiveCounter == configRemoveDepth) then --We are at the depth to remove and the key matches then we delete.
for key in pairs (jsonTable) do --Remove all occurances of said element
jsonTable[key] = nil
end
end
end
end
return jsonTable
end
for _, name in iter(conf.remove.json) do
splitString(name)
if(configRemoveDepth == 0) then
for name in pairs (json_body) do
json_body[name] = nil
end
else
recursiveCounter = 1 --Reset to 1 for each for call
json_body = recursiveSearchAndNullify(json_body)
end
end
Thanks to any who assist, this is my first day with Lua so I am pretty newb.
This is the official answer, found a better way with the help of Christian Sciberras!
local json_body_test_one = {data = { id = {"a", "b"},d = "2" }} --decoded json w cjson
local json_body_test_two = {data = { { id = "a", d = "1" }, { id = "b", d = "2" } } }
local config_json_remove = "data.id"
local function dump(o) --Method to print test tables for debugging
if type(o) == 'table' then
local s = '{ '
for k,v in pairs(o) do
if type(k) ~= 'number' then k = '"'..k..'"' end
s = s .. '['..k..'] = ' .. dump(v) .. ','
end
return s .. '} '
else
return tostring(o)
end
end
local function splitstring(inputstr, sep)
if sep == nil then
sep = "%." --Dot notation default
end
local t={} ; i=1
for str in string.gmatch(inputstr, "([^"..sep.."]+)") do
t[i] = str
i = i + 1
end
return t
end
local function setjsonprop(json_object, path, newvalue)
local configarray = splitstring(path)
while (#configarray > 1) do
json_object = json_object[table.remove(configarray, 1)]
if(type(json_object) == "table" and #json_object > 0) then
local recursepath = table.concat(configarray, ".")
for _, item in pairs(json_object) do
setjsonprop(item, recursepath, newvalue)
end
return
end
end
json_object[table.remove(configarray, 1)] = newvalue
end
setjsonprop(json_body_test_one, config_json_remove, nil)
print(dump(json_body_test_one))
I have a function that converts a decimal value to binary. I understand I have the logic correct as I can get it to work outside of a function.
def decimaltobinary(value):
invertedbinary = []
value = int(value)
while value >= 1:
value = (value / 2)
invertedbinary.append(value)
value = int(value)
for n, i in enumerate(invertedbinary):
if (round(i) == i):
invertedbinary[n] = 0
else:
invertedbinary[n] = 1
invertedbinary.reverse()
value = ''.join(str(e) for e in invertedbinary)
return value
decimaltobinary(firstvalue)
print (firstvalue)
decimaltobinary(secondvalue)
print (secondvalue)
Let's say firstvalue = 5 and secondvalue = 10. The values returned each time the function is executed should be 101 and 1010 respectively. However, the values I get printed are the starting values of five and ten. Why is this happening?
The code works as expected, but you didn't assign the returned value:
>>> firstvalue = decimaltobinary(5)
>>> firstvalue
'101'
Note that there are easier ways to accomplish your goal:
>>> str(bin(5))[2:]
'101'
>>> "{0:b}".format(10)
'1010'
I'm trying to run the spider found in this crawler and for simplicity sake I'm using this start_url because it is just a list of 320 movies. (So, the crawler won't run for 5 hours as given in the github page).
I crawl using scrapy crawl imdb -o output.json but the output.json file contains nothing. It has just a [ in it.
import scrapy
from texteval.items import ImdbMovie, ImdbReview
import urlparse
import math
import re
class ImdbSpider(scrapy.Spider):
name = "imdb"
allowed_domains = ["imdb.com"]
start_urls = [
# "http://www.imdb.com/chart/top",
# "http://www.imdb.com/chart/bottom"
"http://www.imdb.com/search/title?countries=csxx&sort=moviemeter,asc"
]
DOWNLOADER_MIDDLEWARES = {
'scrapy.contrib.downloadermiddleware.robotstxt.ROBOTSTXT_OBEY': True,
}
base_url = "http://www.imdb.com"
def parse(self, response):
movies = response.xpath("//*[#id='main']/table/tr/td[3]/a/#href")
for i in xrange(len(movies)):
l = self.base_url + movies[i].extract()
print l
request = scrapy.Request(l, callback=self.parse_movie)
yield request
next = response.xpath("//*[#id='right']/span/a")[-1]
next_url = self.base_url + next.xpath(".//#href")[0].extract()
next_text = next.xpath(".//text()").extract()[0][:4]
if next_text == "Next":
request = scrapy.Request(next_url, callback=self.parse)
yield request
'''
for sel in response.xpath("//table[#class='chart']/tbody/tr"):
url = urlparse.urljoin(response.url, sel.xpath("td[2]/a/#href").extract()[0].strip())
request = scrapy.Request(url, callback=self.parse_movie)
yield request
'''
def parse_movie(self, response):
movie = ImdbMovie()
i1 = response.url.find('/tt') + 1
i2 = response.url.find('?')
i2 = i2 - 1 if i2 > -1 else i2
movie['id'] = response.url[i1:i2]
movie['url'] = "http://www.imdb.com/title/" + movie['id']
r_tmp = response.xpath("//div[#class='titlePageSprite star-box-giga-star']/text()")
if r_tmp is None or r_tmp == "" or len(r_tmp) < 1:
return
movie['rating'] = int(float(r_tmp.extract()[0].strip()) * 10)
movie['title'] = response.xpath("//span[#itemprop='name']/text()").extract()[0]
movie['reviews_url'] = movie['url'] + "/reviews"
# Number of reviews associated with this movie
n = response.xpath("//*[#id='titleUserReviewsTeaser']/div/div[3]/a[2]/text()")
if n is None or n == "" or len(n) < 1:
return
n = n[0].extract().replace("See all ", "").replace(" user reviews", "")\
.replace(" user review", "").replace(",", "").replace(".", "").replace("See ", "")
if n == "one":
n = 1
else:
n = int(n)
movie['number_of_reviews'] = n
r = int(math.ceil(n / 10))
for x in xrange(1, r):
start = x * 10 - 10
url = movie['reviews_url'] + "?start=" + str(start)
request = scrapy.Request(url, callback=self.parse_review)
request.meta['movieObj'] = movie
yield request
def parse_review(self, response):
ranks = response.xpath("//*[#id='tn15content']/div")[0::2]
texts = response.xpath("//*[#id='tn15content']/p")
del texts[-1]
if len(ranks) != len(texts):
return
for i in xrange(0, len(ranks) - 1):
review = ImdbReview()
review['movieObj'] = response.meta['movieObj']
review['text'] = texts[i].xpath("text()").extract()
rating = ranks[i].xpath(".//img[2]/#src").re("-?\\d+")
if rating is None or rating == "" or len(rating) < 1:
return
review['rating'] = int(rating[0])
yield review
Can someone tell me where am I going wrong?
In my opinion, this web site should be load the list of movies use by js. Fristly, I suggest you should check the output about: movies = response.xpath("//*[#id='main']/table/tr/td[3]/a/#href"). If you want to get js content, you can use webkit in scrapy as a downloader middleware.
I want to loop through different indexed tables by only passing the initial table as an argument.
I currently have this table:
local table = {
stuff_1 = {
categories = {},
[1] = {
name = 'wui',
time = 300
}
},
stuff_2 = {
categories = {'stuff_10', 'stuff_11', 'stuff_12'},
stuff_10 = {
categories = {},
[1] = {
name = 'peo',
time = 150
},
[2] = {
name = 'uik',
time = 15
},
[3] = {
name = 'kpk',
time = 1230
},
[4] = {
name = 'aer',
time = 5000
}
},
stuff_11 = {
categories = {},
[1] = {
name = 'juio',
time = 600
}
},
stuff_12 = {
categories = {},
[1] = {
name = 'erq',
time = 980
},
[2] = {
name = 'faf',
time = 8170
}
}
}
I wanted to make a recursive function to check if the name in any of those tables was equal to some certain thing and return a string.
The recursivity lies in the idea of updating this table with whatever ammount I'd like (or until a certain limit).
I don't understand exactly what's wrong since when I try:
for k, v in pairs(table) do
print(k, v, #v.categories)
end
It correctly prints:
stuff_2 table: 0x10abb0 3
stuff_1 table: 0x10aab8 0
But when passing the table as a parameter to the the function below, it gives this error:
[string "stdin"]:84: attempt to get length of field 'categories' (a nil value)
Function:
function checkMessage(table)
local i = 1
local message = ""
for k, v in pairs(table) do
if(#v.categories == 0) then
while(v[i]) do
if(v[i].name == 'opd') then
if(v[i].time ~= 0) then
message = "return_1"
else
message = "return_2"
end
end
i = i + 1
end
else
checkMessage(table[k])
end
end
return message
end
EDIT: The problem lies in not ignoring that when using pairs onto the table, this doesn't just have tables with a category subtable but it also has a table named category, if this is ignored then the problem is fixed.
You're recursing into subtables that don't have a categories field. Trying to access categories on them yields nil, which you then try to use the length operator on. Hence your error:
attempt to get length of field 'categories' (a nil value)
If you can't hand trace your app, put in more print statements or get a line level debugger.