I'm trying to scrape image URLs from Amazon products, for example, this link.
In the page source code, there is a section which contains all the URLs for images of different sizes (large, medium, hiRes, etc.). I can get that part of the script with Scrapy by doing
imagesString = (response.xpath('//script[contains(., "ImageBlockATF")]/text()').extract_first())
which gives me a string that looks like this:
P.when('A').register("ImageBlockATF", function(A){
var data = {
'colorImages': { 'initial': [{"hiRes":"https://images-na.ssl-images-amazon.com/images/I/81FED1p-sTL._SL1500_.jpg","thumb":"https://images-na.ssl-images-amazon.com/images/I/31HoKqtljqL._SS40_.jpg","large":"https://images-na.ssl-images-amazon.com/images/I/31HoKqtljqL.jpg","main":{"https://images-na.ssl-images-amazon.com/images/I/81FED1p-sTL._SX355_.jpg":[308,355],"https://images-na.ssl-images-amazon.com/images/I/81FED1p-sTL._SX450_.jpg":[390,450],"https://images-na.ssl-images-amazon.com/images/I/81FED1p-sTL._SX425_.jpg":[369,425],"https://images-na.ssl-images-amazon.com/images/I/81FED1p-sTL._SX466_.jpg":[404,466],"https://images-na.ssl-images-amazon.com/images/I/81FED1p-sTL._SX522_.jpg":[453,522],"https://images-na.ssl-images-amazon.com/images/I/81FED1p-sTL._SX569_.jpg":[494,569],"https://images-na.ssl-images-amazon.com/images/I/81FED1p-sTL._SX679_.jpg":[589,679]},"variant":"MAIN","lowRes":null},{"hiRes":"https://images-na.ssl-images-amazon.com/images/I/81e8905DlhL._SL1500_.jpg","thumb":"https://images-na.ssl-images-amazon.com/images/I/31Y%2B8oE5DtL._SS40_.jpg","large":"https://images-na.ssl-images-amazon.com/images/I/31Y%2B8oE5DtL.jpg","main":{"https://images-na.ssl-images-amazon.com/images/I/81e8905DlhL._SX355_.jpg":[308,355],"https://images-na.ssl-images-amazon.com/images/I/81e8905DlhL._SX450_.jpg":[390,450],"https://images-na.ssl-images-amazon.com/images/I/81e8905DlhL._SX425_.jpg":[369,425],"https://images-na.ssl-images-amazon.com/images/I/81e8905DlhL._SX466_.jpg":[404,466],"https://images-na.ssl-images-amazon.com/images/I/81e8905DlhL._SX522_.jpg":[453,522],"https://images-na.ssl-images-amazon.com/images/I/81e8905DlhL._SX569_.jpg":[494,569],"https://images-na.ssl-images-amazon.com/images/I/81e8905DlhL._SX679_.jpg":[589,679]},"variant":"PT01","lowRes":null},{"hiRes":null,"thumb":"https://images-na.ssl-images-amazon.com/images/I/51rORrvh0hL._SS40_.jpg","large":"https://images-na.ssl-images-amazon.com/images/I/51rORrvh0hL.jpg","main":{"https://images-na.ssl-images-amazon.com/images/I/51rORrvh0hL._SX355_.jpg":[236,355],"https://images-na.ssl-images-amazon.com/images/I/51rORrvh0hL._SX450_.jpg":[300,450],"https://images-na.ssl-images-amazon.com/images/I/51rORrvh0hL._SX425_.jpg":[283,425],"https://images-na.ssl-images-amazon.com/images/I/51rORrvh0hL._SX466_.jpg":[310,466],"https://images-na.ssl-images-amazon.com/images/I/51rORrvh0hL.jpg":[333,500]},"variant":"PT02","lowRes":null},{"hiRes":null,"thumb":"https://images-na.ssl-images-amazon.com/images/I/41L2OU5rPyL._SS40_.jpg","large":"https://images-na.ssl-images-amazon.com/images/I/41L2OU5rPyL.jpg","main":{"https://images-na.ssl-images-amazon.com/images/I/41L2OU5rPyL._SX355_.jpg":[236,355],"https://images-na.ssl-images-amazon.com/images/I/41L2OU5rPyL._SX450_.jpg":[300,450],"https://images-na.ssl-images-amazon.com/images/I/41L2OU5rPyL._SX425_.jpg":[283,425],"https://images-na.ssl-images-amazon.com/images/I/41L2OU5rPyL._SX466_.jpg":[310,466],"https://images-na.ssl-images-amazon.com/images/I/41L2OU5rPyL.jpg":[333,500]},"variant":"PT03","lowRes":null},{"hiRes":null,"thumb":"https://images-na.ssl-images-amazon.com/images/I/51%2BsCYjx6OL._SS40_.jpg","large":"https://images-na.ssl-images-amazon.com/images/I/51%2BsCYjx6OL.jpg","main":{"https://images-na.ssl-images-amazon.com/images/I/51%2BsCYjx6OL._SX355_.jpg":[236,355],"https://images-na.ssl-images-amazon.com/images/I/51%2BsCYjx6OL._SX450_.jpg":[300,450],"https://images-na.ssl-images-amazon.com/images/I/51%2BsCYjx6OL._SX425_.jpg":[283,425],"https://images-na.ssl-images-amazon.com/images/I/51%2BsCYjx6OL._SX466_.jpg":[310,466],"https://images-na.ssl-images-amazon.com/images/I/51%2BsCYjx6OL.jpg":[333,500]},"variant":"PT04","lowRes":null}]},
'colorToAsin': {'initial': {}},
'holderRatio': 1.0,
'holderMaxHeight': 700,
'heroImage': {'initial': []},
'heroVideo': {'initial': []},
'spin360ColorData': {'initial': {}},
'spin360ColorEnabled': {'initial': 0},
'spin360ConfigEnabled': false,
'spin360LazyLoadEnabled': false,
'playVideoInImmersiveView':'false',
'tabbedImmersiveViewTreatment':'T2',
'totalVideoCount':'0',
'videoIngressATFSlateThumbURL':'',
'mediaTypeCount':'0',
'atfEnhancedHoverOverlay' : true,
'winningAsin': 'B00XLSS79Y',
'weblabs' : {},
'aibExp3Layout' : 1,
'aibRuleName' : 'frank-powered',
'acEnabled' : false
};
A.trigger('P.AboveTheFold'); // trigger ATF event.
return data;
});
My goal is to get the data inside colorImages into a JSON dictionary, so I can then easily get each URL.
I tried doing something like this:
m = re.search(r'^var data = ({.*};)', imagesString , re.S | re.M)
data = m.group()
jsonObj = json.loads(data[:-1].replace("'", '"'))
But it seems that imagesString does not play well with re.search; I keep getting errors about imagesString not being a string when it actually is.
I got similar data from an Amazon page by using re.findall, something like this (script is a chunk of text I got from the page):
variationValues = re.findall(r'variationValues\" : ({.*?})', ' '.join(script))[0]
and then
variationValuesDict = json.loads(variationValues)
But my knowledge of regular expressions is not that great.
From the string I pasted above, I erased the start and end so that only the data remained, which left me with this:
https://jsoneditoronline.org/?id=9ea92643044f4ac88bcc3e76d98425fc
I can't figure out how to get colorImages with re.findall() (or from the data in the JSON editor) so I can then load it into JSON and use it like a dictionary. Any ideas on how to achieve this?
You just need to convert the var data block into valid JSON first. Strip the surrounding JavaScript and replace the single-quoted keys with double-quoted ones, and you are left with a valid JSON object. Two other things worth checking in your attempt: m.group() returns the whole match including var data =, so you want m.group(1) for just the captured braces; and extract_first() returns None when the XPath matches nothing, which would explain the errors about imagesString not being a string.
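A minimal sketch of that idea, assuming the script block keeps the shape shown above (single-quoted outer keys, with the 'initial' value already being double-quoted JSON):

import json
import re

# inside the Scrapy callback; `response` is the usual scrapy Response object
imagesString = response.xpath(
    '//script[contains(., "ImageBlockATF")]/text()'
).extract_first()

# capture just the array that sits between 'colorImages' and 'colorToAsin'
m = re.search(
    r"'colorImages':\s*\{\s*'initial':\s*(\[.*?\])\s*\}\s*,\s*'colorToAsin'",
    imagesString,
    re.S,
)
if m:
    colorImages = json.loads(m.group(1))  # group(1), not group()
    for image in colorImages:
        # hiRes can be null, so fall back to the large image
        print(image["hiRes"] or image["large"])

Pinning the capture between 'colorImages' and 'colorToAsin' sidesteps quote replacement entirely, since the captured array is already valid JSON.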
I have built a custom connector to connect to the Vimeo API via OAuth2. Everything is working well, but it appears I need to come up with a solution to deal with pagination, as I am only getting back 25 items per page.
I see the documentation on how to use Table.GenerateByPage and getNextPage here:
https://learn.microsoft.com/en-us/power-query/samples/trippin/5-paging/readme#tablegeneratebypage
as well as the implementation within the example GitHub custom connector:
https://github.com/microsoft/DataConnectors/blob/master/samples/Github/github.pq
A sample of functions from that example:
Github.Contents = (url as text) =>
    let
        content = Web.Contents(url),
        link = GetNextLink(content),
        json = Json.Document(content),
        table = Table.FromList(json, Splitter.SplitByNothing())
    in
        table meta [Next=link];
Github.PagedTable = (url as text) => Table.GenerateByPage((previous) =>
    let
        // If we have a previous page, get its Next link from metadata on the page.
        next = if (previous <> null) then Value.Metadata(previous)[Next] else null,
        // If we have a next link, use it, otherwise use the original URL that was passed in.
        urlToUse = if (next <> null) then next else url,
        // If we have a previous page, but don't have a next link, then we're done paging.
        // Otherwise retrieve the next page.
        current = if (previous <> null and next = null) then null else Github.Contents(urlToUse),
        // If we got data back from the current page, get the link for the next page
        link = if (current <> null) then Value.Metadata(current)[Next] else null
    in
        current meta [Next=link]);
GetNextLink = (response, optional request) =>
    let
        // extract the "Link" header if it exists
        link = Value.Metadata(response)[Headers][#"Link"]?,
        links = Text.Split(link, ","),
        splitLinks = List.Transform(links, each Text.Split(Text.Trim(_), ";")),
        next = List.Select(splitLinks, each Text.Trim(_{1}) = "rel=""next"""),
        first = List.First(next),
        removedBrackets = Text.Range(first{0}, 1, Text.Length(first{0}) - 2)
    in
        try removedBrackets otherwise null;
However, my issue is that the pagination metadata returned by the Vimeo API comes through in the JSON body of the response instead of in the headers, as is assumed in the documentation and examples. Is there an easy way or helper function within Power Query/M that would allow me to look into the body of the JSON response, grab the pagination JSON objects (as below), and build out my code from there?
Here is what comes back regarding pagination from Vimeo's API within the JSON body:
"total": 1012,
"page": 1,
"per_page": 25,
"paging": {
"next": "/users/{our-user-id}/videos?page=2",
"previous": null,
"first": "/users/{our-user-id}/videos?page=1",
"last": "/users/{our-user-id}/videos?page=41"
},
Many thanks for any help - it is very much appreciated!
Best,
-Josh
It's hard to put something together from just that info, but see if this helps:
GetNextLink = (response) =>
    // response is data already run through Web.Contents()
    // x evaluates to the line that contains "first":,
    //   e.g. "first": "/users/{our-user-id}/videos?page=1",
    // y parses x to get /users/{our-user-id}/videos?page=1
    let
        Source = Lines.FromBinary(response),
        x = List.RemoveNulls(List.Transform(List.Positions(Source), each if Text.Contains(Source{_}, """first"":") then Source{_} else null)){0},
        y = Text.BetweenDelimiters(x, ": """, """")
    in
        y
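Since Vimeo returns its paging links in the JSON body rather than in a Link header, the per-page function can also read them straight from the parsed record. Here is a sketch built on two assumptions not shown in your snippet: that the items live in a data field of the body, and that paging[next] is a path relative to https://api.vimeo.com:

Vimeo.Contents = (url as text) =>
    let
        body = Json.Document(Web.Contents(url)),
        // paging[next] is null on the last page, which is what stops Table.GenerateByPage
        nextPath = try body[paging][next] otherwise null,
        nextUrl = if nextPath <> null then "https://api.vimeo.com" & nextPath else null,
        table = Table.FromList(body[data], Splitter.SplitByNothing())
    in
        table meta [Next = nextUrl];

With that in place, the Github.PagedTable pattern above should work unchanged, apart from calling Vimeo.Contents instead of Github.Contents.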
I'm looking for the "opposite" of the JSFormat function from the JSTools plugin. Here is an example:
JSON code example:
title = Automatic at 07.02.17 & appId = ID_1 & data = {
"base": "+:background1,background2",
"content": [{
"appTitle": "Soil",
"service": {
"serviceType": "AG",
"Url": "http://test.de/xxx"
},
"opacity": "1"]
}
],
"center": "4544320.372869264,5469450.086030475,31468"
}
& context = PARAMETERS
and I need to convert that format to the following format:
title=Automatic at 07.02.17 &appId=ID_1&data={"base":"+:background1,background2","content":[{"appTitle":"Soil","service":{"serviceType":"AG","Url":"http://test.de/xxx"},"opacity":"1"]}],"center":"4544320.372869264,5469450.086030475,31468"}&context=PARAMETERS
which is a decoded URL (with MIME Tools) from this HTML POST:
title%3DAutomatic%20at%2007.02.17%20%26appId%3DID_1%26data%3D%7B%22base%22%3A%22+%3Abackground1,background2%22,%22content%22%3A%5B%7B%22appTitle%22%3A%22Soil%22,%22service%22%3A%7B%22serviceType%22%3A%22AG%22,%22Url%22%3A%22http%3A%2F%2Ftest.de%2Fxxx%22%7D,%22opacity%22%3A%221%22%5D%7D%5D,%22center%22%3A%224544320.372869264,5469450.086030475,31468%22%7D%26context%3DPARAMETERS%0D%0A
which I have to get back to after making changes in the JSON code. From the second to the third format I can use URL Encode (MIME Tools), but what about the reformatting from the first to the second format?
My question: do you have ideas on how to turn the first (JSON) format into the second (decoded URL) format in Notepad++? Something like the "opposite" of JSFormat?
If I understand correctly, you basically need to put your JSON on a single line, removing newlines and spaces.
This should be achieved with these steps:
CTRL + H to replace occurrences of more than one space with the empty string using this regex: [ ]{2,} (remember to select the "Regular expression" radio button). If this is not exactly what you want, you can adjust the regular expression to achieve the desired output
select all your JSON with CTRL + A
put everything on a single line with a join, CTRL + J
You can also record a macro to automate this process and run it with a keyboard shortcut.
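If you need to do this repeatedly, the same two steps are also easy to script outside Notepad++. A small Python sketch of the same idea (the file name is a placeholder, and line breaks are dropped outright rather than joined with a space):

import re

# placeholder file name; point this at the file holding the formatted JSON
with open("request.txt", encoding="utf-8") as f:
    text = f.read()

# step 1: collapse runs of two or more spaces, same as the [ ]{2,} replace
text = re.sub(r"[ ]{2,}", "", text)
# step 2: join every line into one, the same intent as CTRL + J
text = "".join(text.splitlines())
print(text)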
I'm working on a digest email to send to users of my company's app. For this I'm going through each user's emails and trying to find some basic information about each one (from, subject, timestamp, and, the aspect that's causing me difficulty, an image).
I assumed Nokogiri's search('img') function would be fine to pull out images. Unfortunately it looks like most emails have a lot of garbage embedded in the URLs of those images, like newlines ("\n"), escape characters ("\"), and the string "3D" for some reason. For example:
<img src=3D\"https://=\r\nd3ui957tjb5bqd.cloudfront.net/images/emails/1/logo.png\"
This is causing the search to pull out only pieces of the actual URLs/src values:
#(Element:0x3fd0c8e83b80 {
name = "img",
attributes = [
#(Attr:0x3fd0c8e82a28 { name = "src", value = "3D%22https://=" }),
#(Attr:0x3fd0c8e82a14 { name = "d3ui957tjb5bqd.cloudfront.net", value = "" }),
#(Attr:0x3fd0c8e82a00 { name = "width", value = "3D\"223\"" }),
#(Attr:0x3fd0c8e829ec { name = "heigh", value = "t=3D\"84\"" }),
#(Attr:0x3fd0c8e829d8 { name = "alt", value = "3D\"Creative" }),
#(Attr:0x3fd0c8e829c4 { name = "market", value = "" }),
#(Attr:0x3fd0c8e829b0 { name = "border", value = "3D\"0\"" })]
})
Does anyone have an idea why this is happening, and how to remove all this junk?
I'm getting decent results from lots of gsub calls and safety checks, but it feels pretty tacky.
I've also tried Sanitize.clean, which doesn't work, and the PermitScrubber mentioned in "How to sanitize html string except image url?".
The mail body is encoded as quoted-printable. In that encoding = is the escape character, which is why every = in the markup shows up as =3D (the hex code for =) and why long lines are soft-wrapped with a trailing =. You will need to decode the body before you parse it with Nokogiri. You can do this fairly easily with Ruby using unpack:
decoded = encoded.unpack('M').first
You should check what the encoding is by looking at the mail headers before trying to decode, not all mail is encoded this way, and there are other types of encoding.
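Putting it together, a minimal sketch; encoded stands in for however you pull the raw part body out of the message, so adapt it to your mail-reading code:

require 'nokogiri'

# 'M' is the quoted-printable directive for String#unpack
decoded = encoded.unpack('M').first

doc = Nokogiri::HTML(decoded)
# with the =3D noise decoded away, the src attributes parse cleanly
image_urls = doc.search('img').map { |img| img['src'] }.compact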
I am not a master of scraping, but you can get it through the CSS attribute:
.at_css("img")['src']
For example:
require "open-uri"
require "nokogiri"
doc = open(url_link)
page = Nokogiri::HTML(doc)
page.css("div.col-xs-12.visible-xs.visible-sm div.school-image").each do |pic|
img = pic.at_css("img")['src'].downcase if pic.at_css("img")
end
After retrieving results from the Google Custom Search API and writing them to JSON, I want to parse that JSON to make valid Elasticsearch documents. You can configure a parent-child relationship for nested results. However, this relationship does not seem to be inferred from the data structure itself. I've tried loading it automatically, but with no results.
Below is some example input that doesn't include things like id or index; I'm trying to focus on creating the correct data structure. I've tried modifying graph algorithms like depth-first search, but am running into problems with the different data structures.
Here's some example input:
# mock data structure
google = {"content": "foo",
          "results": {"result_one": {"persona": "phone",
                                     "personb": "phone",
                                     "personc": "phone"
                                     },
                      "result_two": ["thing1",
                                     "thing2",
                                     "thing3"
                                     ],
                      "result_three": "none"
                      },
          "query": ["Taylor Swift", "Bob Dole", "Rocketman"]
          }
# correctly formatted documents for _source of elasticsearch entry
correct_documents = [
    {"content": "foo"},
    {"results": ["result_one", "result_two", "result_three"]},
    {"result_one": ["persona", "personb", "personc"]},
    {"persona": "phone"},
    {"personb": "phone"},
    {"personc": "phone"},
    {"result_two": ["thing1", "thing2", "thing3"]},
    {"result_three": "none"},
    {"query": ["Taylor Swift", "Bob Dole", "Rocketman"]}
]
Here is my current approach; it is still a work in progress:
def recursive_dfs(graph, start, path=[]):
    '''recursive depth first search from start'''
    path = path + [start]
    for node in graph[start]:
        if node not in path:
            path = recursive_dfs(graph, node, path)
    return path
def branching(google):
    """Get branches as a starting point for dfs"""
    branch = 0
    keys = list(google.keys())  # dict_keys is not indexable in Python 3
    while branch < len(google):
        if isinstance(google[keys[branch]], dict):  # 'is dict' never matches
            # recursive_dfs(google, keys[branch])
            pass
        else:
            print("branch {}: result {}\n".format(branch, google[keys[branch]]))
        branch += 1

branching(google)
You can see that recursive_dfs() still needs to be modified to handle string and list data structures.
I'll keep going at this but if you have thoughts, suggestions, or solutions then I would very much appreciate it. Thanks for your time.
Here is a possible answer to your problem.
def myfunk(inHole, outHole):
    for key in inHole:
        is_list = isinstance(inHole[key], list)
        is_dict = isinstance(inHole[key], dict)
        if is_list:
            outHole.append({key: inHole[key]})
        if is_dict:
            # record the nested dict's keys, then recurse into it
            outHole.append({key: list(inHole[key].keys())})
            myfunk(inHole[key], outHole)
        if not (is_list or is_dict):
            outHole.append({key: inHole[key]})
    # note: returning outHole.sort() would return None (and dicts aren't
    # orderable in Python 3), so return the accumulated list itself
    return outHole
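Running it over the mock structure above reproduces the shape of correct_documents (dict order is preserved on Python 3.7+):

documents = myfunk(google, [])
# [{'content': 'foo'},
#  {'results': ['result_one', 'result_two', 'result_three']},
#  {'result_one': ['persona', 'personb', 'personc']},
#  {'persona': 'phone'}, {'personb': 'phone'}, {'personc': 'phone'},
#  {'result_two': ['thing1', 'thing2', 'thing3']},
#  {'result_three': 'none'},
#  {'query': ['Taylor Swift', 'Bob Dole', 'Rocketman']}]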
I'm trying to take data in from a JSON file and link it to my GeoJSON file to create a choropleth map, with the county colours bound to the "amount" value. I would also like a corresponding "comment" value to be bound to a div for when I mouse over that county.
My code at http://bl.ocks.org/eoiny/6244102 will work to generate a choropleth map when my counties.json data is in the form:
"Carlow":3,"Cavan":4,"Clare":5,"Cork":3,
But things get tricky when I try to use the following form:
{
    "id": "Carlow",
    "amount": 11,
    "comment": "The figures for Carlow show a something."
},
I can't get my head around how to join the "id": "Carlow" from counties.json with the "id": "Carlow" path created from ireland.json, while at the same time having access to the other values in counties.json, i.e. "amount" and "comment".
Apologies for my inarticulate question, but if anyone could point me to an example or reference I could look up, that would be great.
I would preprocess the data when it's loaded to make lookup easier in your quantize function. Basically, replace this: data = json; with this:
data = json.reduce(function(result, county) {
    result[county.id] = county;
    return result;
}, {});
and then in your quantize function, you get at the amounts like this:
function quantize(d) {
    return "q" + Math.min(8, ~~(data[d.id].amount * 9 / 12)) + "-9";
}
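The same lookup also covers the mouseover half of the question. A sketch, where the .county selector and #comment div are placeholders to be matched to the class and element in your own page:

svg.selectAll(".county")
    .on("mouseover", function(d) {
        // d.id is the county id, the same key used by the lookup object
        d3.select("#comment").text(data[d.id].comment);
    })
    .on("mouseout", function() {
        d3.select("#comment").text("");
    });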
What the preprocessing does is turn this array (easily accessed by index):
[{id: 'xyz', ...}, {id: 'pdq', ...}, ...]
into this object with county keys (easily accessed by county id):
{'xyz': {id: 'xyz', ...}, 'pdq': {id: 'pdq', ...}, ...}
Here's the working gist: http://bl.ocks.org/rwaldin/6244803