css selector using Cheerio exclusion of double class

css selector using Cheerio exclusion of double class - html

I have a rather "weird" scenario (web scraping). I want to select class=g only but not those with class="g g" (double class g). How to do that in jQuery?
If I use $('.g'), it will select both .g and .g .g
UPDATE 1:
If you don't think .g .g is valid, do View Source in Google search results ;)

Even if I don't think g g is valid (Let me check it)..and therefore $(".g:not(.g.g)") wouldn't work, you could do something like:
// Keep only the elements whose class attribute contains the token "g" exactly
// once: class="g" (or class="g foo") is kept, class="g g" is dropped.
// The previous `.map(... indexOf('g g') && this)` version was wrong because
// indexOf returns 0 (falsy) for a match at the start and -1 (truthy) for a
// miss: a "g g" element was mapped to the number 0 instead of being removed
// (jQuery's .map only drops null/undefined), and a double class that is not
// at the start of the attribute (e.g. "x g g") was kept.
var myWeirdElements = $('.g').filter(function () {
  // Read the raw attribute through .attr('class') rather than this.className,
  // so the same code can be used with jQuery-compatible wrappers that do not
  // expose DOM properties (NOTE(review): e.g. Cheerio; confirm for your version).
  var classTokens = ($(this).attr('class') || '').split(/\s+/);
  var gCount = 0;
  for (var i = 0; i < classTokens.length; i++) {
    if (classTokens[i] === 'g') {
      gCount++;
    }
  }
  return gCount === 1;
});
// or, as mentioned on the comment, using Array.filter method
demo => http://jsfiddle.net/GeAaA/
Edit: about the 2nd comment, you just have to count how many g (simple regex)

Considering the following scenario where you have two Div tag with above classes
<div class="g"></div>
<div class="g g"></div>
Write the following to get only those divs whose class attribute is exactly "g":
alert($("div[class='g']").length);

As an alternative solution to other answers, you can use SerpApi's Google Search Engine Results API. This API makes it possible to forget about figuring out how to extract something from the page, bypass blocks from search engines, and maintain the code (parser) over time.
All you need is to iterate over the structured JSON and get the data you want. Check out the playground.
Usage example (also, check it in the online IDE):
// Minimal SerpApi example: run a single Google search and print the response.
const SerpApi = require('google-search-results-nodejs');

// Parameters describing the search to perform.
const params = {
  engine: "google",            // search engine to query
  q: "Coffee",                 // search query
  google_domain: "google.com", // Google domain used for the search
  gl: "us",                    // country to use for the Google search
  hl: "en",                    // language to use for the Google search
};

// Invoked with the response data; simply logs it.
const callback = (data) => {
  console.log(data);
};

// The API key (from serpapi.com) is read from the API_KEY environment variable.
const search = new SerpApi.GoogleSearch(process.env.API_KEY);

// Request the result as JSON and pass it to the callback.
search.json(params, callback);
Output:
{
"recipes_results":[
{
"title":"Coffee recipes",
"link":"https://www.bbcgoodfood.com/recipes/collection/coffee-recipes",
"source":"BBC Good Food",
"ingredients":[
"Instant coffee"
],
"thumbnail":"https://serpapi.com/searches/62bda4a2f0adfb0a9364cfb4/images/b50a38f136c7abe67e61eef25884f35798c0551f1d43ace6237f6830200fa235.jpeg"
},
{
"title":"20 Great Coffee Drinks From Around the World",
"link":"https://insanelygoodrecipes.com/coffee-recipes/",
"source":"Insanely Good Recipes",
"rating":4.5,
"reviews":8,
"ingredients":[
"Turkish coffee",
"vietnamese coffee",
"white chocolate mocha",
"cold brew coffee",
"frappuccino"
],
"thumbnail":"https://serpapi.com/searches/62bda4a2f0adfb0a9364cfb4/images/b50a38f136c7abe67e61eef25884f357aa875ca61585258439c617a0846d590a.jpeg"
},
{
"title":"Dalgona Coffee",
"link":"https://www.delish.com/cooking/recipe-ideas/a32072159/dalgona-coffee-recipe/",
"source":"Delish.com",
"rating":3.8,
"reviews":42,
"total_time":"10 min",
"ingredients":[
"Instant coffee",
"ice",
"milk"
],
"thumbnail":"https://serpapi.com/searches/62bda4a2f0adfb0a9364cfb4/images/b50a38f136c7abe67e61eef25884f357b2162e9f1c396805c73a0850d499fd01.jpeg"
}
],
"local_map":{
"link":"https://www.google.com/search?gl=us&hl=en&q=Coffee&npsic=0&rflfq=1&rldoc=1&rllag=37769342,-122389993,242&tbm=lcl&sa=X&ved=2ahUKEwiFz_LLo9X4AhUwGlkFHeVCB84QtgN6BAgXEAE",
"image":"https://serpapi.com/searches/62bda4a2f0adfb0a9364cfb4/images/57f06389eb0c57c24320522b45952370.png",
"gps_coordinates":{
"latitude":37.769342,
"longitude":-122.389993,
"altitude":242
}
},
"local_results":{
"more_locations_link":"https://www.google.com/search?gl=us&hl=en&tbs=lf:1,lf_ui:9&tbm=lcl&q=Coffee&rflfq=1&num=10&sa=X&ved=2ahUKEwiFz_LLo9X4AhUwGlkFHeVCB84QjGp6BAgWEAI",
"places":[
{
"position":1,
"title":"Philz Coffee",
"place_id":"14850561974798048043",
"lsig":"AB86z5V2CpkHAoUti4UAfsYs_iF5",
"place_id_search":"https://serpapi.com/search.json?device=desktop&engine=google&gl=us&google_domain=google.com&hl=en&lsig=AB86z5V2CpkHAoUti4UAfsYs_iF5&ludocid=14850561974798048043&q=Coffee&tbm=lcl",
"description":"Dine-in·Takeout·Delivery",
"rating":4.5,
"type":"Philz Coffee",
"hours":"San Francisco, CA · In Chase Center",
"address":"4.5(21) · $$ · Coffee shop",
"thumbnail":"https://serpapi.com/searches/62bda4a2f0adfb0a9364cfb4/images/db978ef22623ebbaba5a259d7610a82f1e67d73393884509c7d902e95abeda94a9e2ac25fedcf5ad.jpeg"
}, ... other results
]
},
"organic_results":[
{
"position":1,
"title":"Coffee - Wikipedia",
"link":"https://en.wikipedia.org/wiki/Coffee",
"displayed_link":"https://en.wikipedia.org › wiki › Coffee",
"thumbnail":"https://serpapi.com/searches/62bda4a2f0adfb0a9364cfb4/images/0293bd7aa87e25161235b6fe723a6c2dae65c3f9f164074026df984d8f3e5820.jpeg",
"snippet":"Coffee is a brewed drink prepared from roasted coffee beans, the seeds of berries from certain flowering plants in the Coffea genus. From the coffee fruit, ...",
"snippet_highlighted_words":[
"Coffee",
"coffee",
"coffee"
],
"sitelinks":{
"inline":[
{
"title":"Coffee bean",
"link":"https://en.wikipedia.org/wiki/Coffee_bean"
},
{
"title":"History",
"link":"https://en.wikipedia.org/wiki/History_of_coffee"
},
{
"title":"Coffee preparation",
"link":"https://en.wikipedia.org/wiki/Coffee_preparation"
},
{
"title":"Coffee production",
"link":"https://en.wikipedia.org/wiki/Coffee_production"
}
]
},
"rich_snippet":{
"bottom":{
"extensions":[
"Region of origin: Horn of Africa and ‎South Ara...‎",
"Color: Black, dark brown, light brown, beige",
"Introduced: 15th century"
],
"detected_extensions":{
"introduced_th_century":15
}
}
},
"about_this_result":{
"source":{
"description":"Wikipedia is a multilingual free online encyclopedia written and maintained by a community of volunteers through open collaboration and a wiki-based editing system. Individual contributors, also called editors, are known as Wikipedians. Wikipedia is the largest and most-read reference work in history.",
"icon":"https://serpapi.com/searches/62bda4a2f0adfb0a9364cfb4/images/0293bd7aa87e25161235b6fe723a6c2d8021773b9a274d74b2c9d60da12b643d997acd2685004b3a989713a5bd03429c.png"
},
"keywords":[
"coffee"
],
"languages":[
"English"
],
"regions":[
"the United States"
]
},
"about_page_link":"https://www.google.com/search?q=About+https://en.wikipedia.org/wiki/Coffee&tbm=ilp&ilps=ADNMCi0tVhSB-fGHOJYgrIxB0xlXYrPGPA",
"cached_page_link":"https://webcache.googleusercontent.com/search?q=cache:U6oJMnF-eeUJ:https://en.wikipedia.org/wiki/Coffee+&cd=13&hl=en&ct=clnk&gl=us",
"related_pages_link":"https://www.google.com/search?gl=us&hl=en&q=related:https://en.wikipedia.org/wiki/Coffee+Coffee"
}, ... other results
],
... and other search results
}
Disclaimer, I work for SerpApi
As an alternative solution to other answers, you can use SerpApi's Google Search Engine Results API. This API makes it possible to forget about figuring out how to extract something from the page, bypass blocks from search engines, and maintain the code (parser) over time.
All you need is to iterate over the structured JSON and get the data you want. Check out the playground.
Usage example (also, check it in the online IDE):
// Minimal SerpApi example: run a single Google search and print the response.
const SerpApi = require('google-search-results-nodejs');

// Parameters describing the search to perform.
const params = {
  engine: "google",            // search engine to query
  q: "Coffee",                 // search query
  google_domain: "google.com", // Google domain used for the search
  gl: "us",                    // country to use for the Google search
  hl: "en",                    // language to use for the Google search
};

// Invoked with the response data; simply logs it.
const callback = (data) => {
  console.log(data);
};

// The API key (from serpapi.com) is read from the API_KEY environment variable.
const search = new SerpApi.GoogleSearch(process.env.API_KEY);

// Request the result as JSON and pass it to the callback.
search.json(params, callback);
Output:
{
"recipes_results":[
{
"title":"Coffee recipes",
"link":"https://www.bbcgoodfood.com/recipes/collection/coffee-recipes",
"source":"BBC Good Food",
"ingredients":[
"Instant coffee"
],
"thumbnail":"https://serpapi.com/searches/62bda4a2f0adfb0a9364cfb4/images/b50a38f136c7abe67e61eef25884f35798c0551f1d43ace6237f6830200fa235.jpeg"
},
{
"title":"20 Great Coffee Drinks From Around the World",
"link":"https://insanelygoodrecipes.com/coffee-recipes/",
"source":"Insanely Good Recipes",
"rating":4.5,
"reviews":8,
"ingredients":[
"Turkish coffee",
"vietnamese coffee",
"white chocolate mocha",
"cold brew coffee",
"frappuccino"
],
"thumbnail":"https://serpapi.com/searches/62bda4a2f0adfb0a9364cfb4/images/b50a38f136c7abe67e61eef25884f357aa875ca61585258439c617a0846d590a.jpeg"
},
{
"title":"Dalgona Coffee",
"link":"https://www.delish.com/cooking/recipe-ideas/a32072159/dalgona-coffee-recipe/",
"source":"Delish.com",
"rating":3.8,
"reviews":42,
"total_time":"10 min",
"ingredients":[
"Instant coffee",
"ice",
"milk"
],
"thumbnail":"https://serpapi.com/searches/62bda4a2f0adfb0a9364cfb4/images/b50a38f136c7abe67e61eef25884f357b2162e9f1c396805c73a0850d499fd01.jpeg"
}
],
"local_map":{
"link":"https://www.google.com/search?gl=us&hl=en&q=Coffee&npsic=0&rflfq=1&rldoc=1&rllag=37769342,-122389993,242&tbm=lcl&sa=X&ved=2ahUKEwiFz_LLo9X4AhUwGlkFHeVCB84QtgN6BAgXEAE",
"image":"https://serpapi.com/searches/62bda4a2f0adfb0a9364cfb4/images/57f06389eb0c57c24320522b45952370.png",
"gps_coordinates":{
"latitude":37.769342,
"longitude":-122.389993,
"altitude":242
}
},
"local_results":{
"more_locations_link":"https://www.google.com/search?gl=us&hl=en&tbs=lf:1,lf_ui:9&tbm=lcl&q=Coffee&rflfq=1&num=10&sa=X&ved=2ahUKEwiFz_LLo9X4AhUwGlkFHeVCB84QjGp6BAgWEAI",
"places":[
{
"position":1,
"title":"Philz Coffee",
"place_id":"14850561974798048043",
"lsig":"AB86z5V2CpkHAoUti4UAfsYs_iF5",
"place_id_search":"https://serpapi.com/search.json?device=desktop&engine=google&gl=us&google_domain=google.com&hl=en&lsig=AB86z5V2CpkHAoUti4UAfsYs_iF5&ludocid=14850561974798048043&q=Coffee&tbm=lcl",
"description":"Dine-in·Takeout·Delivery",
"rating":4.5,
"type":"Philz Coffee",
"hours":"San Francisco, CA · In Chase Center",
"address":"4.5(21) · $$ · Coffee shop",
"thumbnail":"https://serpapi.com/searches/62bda4a2f0adfb0a9364cfb4/images/db978ef22623ebbaba5a259d7610a82f1e67d73393884509c7d902e95abeda94a9e2ac25fedcf5ad.jpeg"
}, ... other results
]
},
"organic_results":[
{
"position":1,
"title":"Coffee - Wikipedia",
"link":"https://en.wikipedia.org/wiki/Coffee",
"displayed_link":"https://en.wikipedia.org › wiki › Coffee",
"thumbnail":"https://serpapi.com/searches/62bda4a2f0adfb0a9364cfb4/images/0293bd7aa87e25161235b6fe723a6c2dae65c3f9f164074026df984d8f3e5820.jpeg",
"snippet":"Coffee is a brewed drink prepared from roasted coffee beans, the seeds of berries from certain flowering plants in the Coffea genus. From the coffee fruit, ...",
"snippet_highlighted_words":[
"Coffee",
"coffee",
"coffee"
],
"sitelinks":{
"inline":[
{
"title":"Coffee bean",
"link":"https://en.wikipedia.org/wiki/Coffee_bean"
},
{
"title":"History",
"link":"https://en.wikipedia.org/wiki/History_of_coffee"
},
{
"title":"Coffee preparation",
"link":"https://en.wikipedia.org/wiki/Coffee_preparation"
},
{
"title":"Coffee production",
"link":"https://en.wikipedia.org/wiki/Coffee_production"
}
]
},
"rich_snippet":{
"bottom":{
"extensions":[
"Region of origin: Horn of Africa and ‎South Ara...‎",
"Color: Black, dark brown, light brown, beige",
"Introduced: 15th century"
],
"detected_extensions":{
"introduced_th_century":15
}
}
},
"about_this_result":{
"source":{
"description":"Wikipedia is a multilingual free online encyclopedia written and maintained by a community of volunteers through open collaboration and a wiki-based editing system. Individual contributors, also called editors, are known as Wikipedians. Wikipedia is the largest and most-read reference work in history.",
"icon":"https://serpapi.com/searches/62bda4a2f0adfb0a9364cfb4/images/0293bd7aa87e25161235b6fe723a6c2d8021773b9a274d74b2c9d60da12b643d997acd2685004b3a989713a5bd03429c.png"
},
"keywords":[
"coffee"
],
"languages":[
"English"
],
"regions":[
"the United States"
]
},
"about_page_link":"https://www.google.com/search?q=About+https://en.wikipedia.org/wiki/Coffee&tbm=ilp&ilps=ADNMCi0tVhSB-fGHOJYgrIxB0xlXYrPGPA",
"cached_page_link":"https://webcache.googleusercontent.com/search?q=cache:U6oJMnF-eeUJ:https://en.wikipedia.org/wiki/Coffee+&cd=13&hl=en&ct=clnk&gl=us",
"related_pages_link":"https://www.google.com/search?gl=us&hl=en&q=related:https://en.wikipedia.org/wiki/Coffee+Coffee"
}, ... other results
],
... and other search results
}
Disclaimer, I work for SerpApi

Related

How to sort a list of restaurant names by restaurant rating (possibly from Google Places or Yelp Fusion API)

I have a csv file with thousands of restaurant names and addresses that I need to sort by rating (data that is not in the csv). Is there a way to fill in the csv with this data? Possibly with Google Places API or Yelp Fusion API?

Both the Google Places API and Yelp Fusion API let you obtain a restaurant’s rating if you query with the business name and address. I’m going to explain how to do this but, first a caution about compliance. What you describe is clearly against the terms of service for both APIs. The only permitted use of their data is to display it on a publicly available website or app. Fetching and retaining it in a csv file is clearly improper. The APIs are intended for real-time query and immediate display of results for your users.
Google requires that the Places data be displayed in conjunction with a Google map or an approved "powered by Google" image. Additionally, no "pre-fetching, caching, or storage of content" is permitted. For details see https://developers.google.com/places/web-service/policies
Yelp requires attribution, basically requiring you to display the star rating and the Yelp logo with a link back to the business page on Yelp for the restaurant you have queried. See https://www.yelp.com/developers/display_requirements Furthermore, you can’t “cache, record, pre-fetch, or otherwise store any portion of the Yelp Content for a period longer than twenty-four (24) hours from receipt of the Yelp Content, or attempt or provide a means to execute any scraping or "bulk download" operations.” For full text and terms see https://www.yelp.com/developers/api_terms
With the legalese out of the way, here’s how to request a restaurant’s rating from Google Places:
https://maps.googleapis.com/maps/api/place/findplacefromtext/json?input=Applebees,234 W 42nd St,New York,NY&inputtype=textquery&fields=formatted_address,name,rating&key=YOUR_API_KEY
And, the JSON response:
{
"candidates": [
{
"formatted_address": "234 W 42nd St, New York, NY 10036, USA",
"name": "Applebee's Grill + Bar",
"rating": 3.6
}
],
"status": "OK"
}
Here is the same request for Yelp Fusion. There is no way to request just the rating. Results always contain everything in their database for the restaurant:
https://api.yelp.com/v3/businesses/search?term=applebees&location=234 W 42nd St,New York,NY&limit=1
JSON response:
{
"businesses": [
{
"id": "gytFjzBw-z5LZD-6JSMChg",
"alias": "applebees-grill-bar-new-york-3",
"name": "Applebee's Grill + Bar",
"image_url": "https://s3-media1.fl.yelpcdn.com/bphoto/CLizyj9S7pMvwGNm2dgdiQ/o.jpg",
"is_closed": false,
"url": "https://www.yelp.com/biz/applebees-grill-bar-new-york-3?adjust_creative=pnOv3Zj2REsNDMU4Z3-SLg&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=pnOv3Zj2REsNDMU4Z3-SLg",
"review_count": 444,
"categories": [
{
"alias": "tradamerican",
"title": "American (Traditional)"
},
{
"alias": "burgers",
"title": "Burgers"
},
{
"alias": "sportsbars",
"title": "Sports Bars"
}
],
"rating": 2,
"coordinates": {
"latitude": 40.756442,
"longitude": -73.988838
},
"transactions": [
"delivery",
"pickup"
],
"price": "$$",
"location": {
"address1": "234 W 42nd St",
"address2": "",
"address3": "",
"city": "New York",
"zip_code": "10036",
"country": "US",
"state": "NY",
"display_address": [
"234 W 42nd St",
"New York, NY 10036"
]
},
"phone": "+12123917414",
"display_phone": "(212) 391-7414",
"distance": 5.938732504864397
}
],
"total": 2900,
"region": {
"center": {
"longitude": -73.98880004882812,
"latitude": 40.75648701137637
}
}
}

How to get the phonenumber using google places API

I'm writing a code to get the near by locations as prompted by the user using Node.js and making a call to Google's places API. and the call is as below.
https://maps.googleapis.com/maps/api/place/textsearch/json?query=garage+near+corner+of+
fifth+avenue+NY&key=myApiKey
basically here I'm searching for the Garage near the 5th avenue New York. and the response that I get is as below.
{
"html_attributions": [],
"next_page_token": "CqQCHgEAAKUUxRRzEF_vLKTGE5CLg-kBW7K8ot040l4zcsNHEByHpvjrhZZVOWcyCiPBbtErrX9QZNQ9dZQXCaxynnaJonFjF23_PPhWAmIfIRbY40E4gFioA5o4Gm3_OjkOicROypQQlArqaf2ub6vGoMwDKU1eP8m6SFmiMwm1cS_mghR0IWJ2Q9mH5jqpPtWJd2ENX3VCrDPeoOfhaxHAg3DIG7-eh7WYMlT9r6KAERCok-fXnjI49QVrezZYK52aCY3qeLSPLeXonrE2a79jfEdV1EhpBvb4fV2SB2oysCc_xK_Kv3FW6-Ir3WK2jXslrfRvFk7sIRxtgqAlZ2xCZ43oi7DeTl5733S5j4pbFZQHgsg9grLApa_H_wTN2xN1K9UsEhIQOf7B6Gnh668FPRTZe4X78BoUOMt8UiiVZR-YDJDb5AwyME7yYuY",
"results": [
{
"formatted_address": "39 W 23rd St, New York, NY 10010, United States",
"geometry": {
"location": {
"lat": 40.742537,
"lng": -73.99092399999999
},
"viewport": {
"northeast": {
"lat": 40.7437215802915,
"lng": -73.9896989197085
},
"southwest": {
"lat": 40.7410236197085,
"lng": -73.99239688029151
}
}
},
"icon": "https://maps.gstatic.com/mapfiles/place_api/icons/generic_business-71.png",
"id": "b075e432a0597a31ceaf3062747999291d8d8cf7",
"name": "Park-it 23-24 Operating Corp",
"opening_hours": {
"open_now": false,
"weekday_text": []
},
"place_id": "ChIJJalOaKRZwokRNLnbupB5UrY",
"reference": "CmRSAAAA-HGHWnLtBTIkRA7MlY1sjb8jDIxJQokfHSfQ_MWcLLM7lNB_MlUOwO0TcrQO8IYbUz1_l-HiehIwZ3hvg3sHEAiV-ZYpNL11gY15gz6hQsUFFKEkkfqoo_9R1PzVBd9CEhD7AxenKMzgH0LK4lNPFC8GGhSKMjd_5UnZyA3VRJrRYaw79QcxVg",
"types": [
"parking",
"point_of_interest",
"establishment"
]
}
.
.
.
.
}
In the result I received there is no phone number available. Can you please let me know whether I'm using the correct API to get the nearby places, and how I can get the phone number along with the result?
Thanks

Currently, there is no way to get phone number from the Places API search response. In order to get a phone number you have to execute a Places details request for each individual place ID.
E.g. https://maps.googleapis.com/maps/api/place/details/json?placeid=ChIJJalOaKRZwokRNLnbupB5UrY&key=YOUR_API_KEY
The response will contain fields formatted_phone_number and international_phone_number
...
"adr_address":"39 W 23rd St, New York, NY 10010-4295, USA",
"formatted_address":"39 W 23rd St, New York, NY 10010, USA",
"formatted_phone_number":"(212) 727-0141",
"geometry":{
"location":{
"lat":40.742537,"lng":-73.99092399999999
},
"viewport":{
"northeast":{
"lat":40.7437215802915,"lng":-73.9896989197085
},
"southwest":{
"lat":40.7410236197085,"lng":-73.99239688029151
}
}
},
"icon":"https://maps.gstatic.com/mapfiles/place_api/icons/generic_business-71.png",
"id":"b075e432a0597a31ceaf3062747999291d8d8cf7",
"international_phone_number":"+1 212-727-0141",
"name":"Park-it 23-24 Operating Corp",
...
I can also see that somebody tried to file a feature request for this in Google issue tracker back in 2011, however Google closed this feature request as infeasible
https://issuetracker.google.com/issues/35820573
Feel free to create a new feature request in Google issue tracker, hopefully Google may reconsider it in 2017.

Get request to Google Search

I'm trying to get HTML with search results from Google. With sending GET request for example to:
https://www.google.ru/?q=1111
But if in browser all is ok, when I'm trying to use it with curl or to get source with "View source" in Google, there is only some Javascript code, no search result. Is that some type of protection? What can I do?

You now have to use the Google Search API to make your GET requests.
All other methods have been blocked.

The page from your question is the Google Search page with the input field.
The search results page is this one:
https://www.google.ru/search?q=1111
Rotate proxies and user agents, and delay similar requests, to get the HTML from Google Search results pages with fewer bans.
Or use SerpApi to access HTML and the extracted data from it. It has a free trial.
curl -s 'https://serpapi.com/search?q=coffee'
Output
{
// Omitted
"organic_results": [
{
"position": 1,
"title": "Coffee - Wikipedia",
"link": "https://en.wikipedia.org/wiki/Coffee",
"displayed_link": "en.wikipedia.org › wiki › Coffee",
"snippet": "Coffee is a brewed drink prepared from roasted coffee beans, the seeds of berries from certain Coffea species. When coffee berries turn from green to bright red ...",
"sitelinks": {
"expanded": [
{
"title": "History",
"link": "https://en.wikipedia.org/wiki/History_of_coffee",
"snippet": "The history of coffee dates back to the 15th century, and possibly ..."
},
{
"title": "International Coffee Day",
"link": "https://en.wikipedia.org/wiki/International_Coffee_Day",
"snippet": "International Coffee Day (1 October) is an occasion that is ..."
},
{
"title": "List of coffee drinks",
"link": "https://en.wikipedia.org/wiki/List_of_coffee_drinks",
"snippet": "Milk coffee - Nitro cold brew coffee - List of coffee dishes - ..."
},
{
"title": "Portal:Coffee",
"link": "https://en.wikipedia.org/wiki/Portal:Coffee",
"snippet": "Coffee is a brewed drink prepared from roasted coffee beans, the ..."
},
{
"title": "Coffee bean",
"link": "https://en.wikipedia.org/wiki/Coffee_bean",
"snippet": "A coffee bean is a seed of the Coffea plant and the source for ..."
},
{
"title": "Geisha",
"link": "https://en.wikipedia.org/wiki/Geisha_(coffee)",
"snippet": "Geisha coffee, sometimes referred to as Gesha coffee, is a type of ..."
}
],
"list": [
{
"date": "Color‎: ‎Black, dark brown, light brown, beige"
}
]
},
"rich_snippet": {
"bottom": {
"detected_extensions": {
"introduced_th_century": 15
},
"extensions": [
"Introduced‎: ‎15th century",
"Color‎: ‎Black, dark brown, light brown, beige"
]
}
},
"cached_page_link": "https://webcache.googleusercontent.com/search?q=cache:U6oJMnF-eeUJ:https://en.wikipedia.org/wiki/Coffee+&cd=2&hl=sv&ct=clnk&gl=se",
"related_pages_link": "https://www.google.se/search?gl=se&hl=sv&q=related:https://en.wikipedia.org/wiki/Coffee+coffee&sa=X&ved=2ahUKEwjJ9p2p_KXuAhVlRN8KHf22D8wQHzABegQIAhAJ"
}
},
// ...
}
Disclaimer: I work at SerpApi.

To add a bit more sauce to the answers as they are not correct and do not even respond to your problem.
First of all, it's perfectly legal to scrape Google as long as you do not harm their service through it (DoS-like).
Also the methods have not been blocked, it's just not that simple.
The speed depends on your methods, it does not have to be very slow..
You can scrape tens of thousands of keyword pages in a minute if needed.
You will find a better answer to the topic here: Is it ok to scrape data from Google results?
Your problem with curl comes indeed from protection, Google does not allow automated access and it has a very sophisticated set of detection algorithms.
They go from simple user agent checks (that's what stopped you directly) up to artificial intelligence that tries to detect unusual queries or related queries.

You can load it in the browser and then scrape results via Javascript.
Or you can use the Google API, but it seems that it requires payment if you request it more than 100 times per day.

How to get API.AI simply send me the JSON data of the conversation?

I am trying to understand if there is an option to get the conversation logs of the discussions with some sort of a webhook.
The API.AI docs only refer to using webhook for fulfilment purposes , but for now I don't plan my server (GCP ENGINE APP) to supply fulfilment but only to log the relevant parameters from each conversation.
Anyone knows how to approach this?

Turn on the webhook feature for the intent. You will be able to get the requests and all the data associated with it. You will be able to send back to API.AI too. Here is the full circle:
{
"id": "891db09a-851c-43dc-81c6-4c6705c94f85",
"timestamp": "2017-01-03T10:31:18.676Z",
"result": {
"source": "agent",
"resolvedQuery": "yes, France",
"action": "show.news",
"actionIncomplete": false,
"parameters": {
"adjective": "",
"subject": "France"
},
"contexts": [
{
"name": "subject",
"parameters": {
"subject.original": "France",
"adjective": "",
"subject": "France",
"adjective.original": ""
},
"lifespan": 5
},
{
"name": "region",
"parameters": {
"subject.original": "France",
"adjective": "",
"subject": "France",
"adjective.original": ""
},
"lifespan": 5
}
],
"metadata": {
"intentId": "34773849-4ac2-4e28-95a5-7abfc061044e",
"webhookUsed": "true",
"webhookForSlotFillingUsed": "false",
"intentName": "subject"
},
"fulfillment": {
"speech": "Here is the latest news\n\n According to Watson the main emotion expressed in the article is: ;( ( sadness )\n\n Son of Equatorial Guinea’s president facing trial in France\n\nPARIS — After years of investigation, France on Monday put the son of the president of Equatorial Guinea on trial for corruption, charged with spending many millions in state funds — much of it allegedly in cash — to feed an opulent lifestyle of fast cars, designer clothes, works of art and...\n\nRead more: https://www.washingtonpost.com/world/europe/son-of-equatorial-guineas-president-facing-trial-in-france/2017/01/02/b03d30d0-d0cb-11e6-9651-54a0154cf5b3_story.html",
"source": "Washington Post",
"displayText": "Here is the latest news. According to Watson the main emotion expressed in the article is: sadness",
"messages": [
{
"type": 0,
"speech": "Here is the latest news\n\n According to Watson the main emotion expressed in the article is: ;( ( sadness )\n\n Son of Equatorial Guinea’s president facing trial in France\n\nPARIS — After years of investigation, France on Monday put the son of the president of Equatorial Guinea on trial for corruption, charged with spending many millions in state funds — much of it allegedly in cash — to feed an opulent lifestyle of fast cars, designer clothes, works of art and...\n\nRead more: https://www.washingtonpost.com/world/europe/son-of-equatorial-guineas-president-facing-trial-in-france/2017/01/02/b03d30d0-d0cb-11e6-9651-54a0154cf5b3_story.html"
}
],
"data": {
"newsAgent": {
"adjective": "",
"subject": "France",
"intent": "subject",
"action": "show.news",
"news": {
"title": "Son of Equatorial Guinea’s president facing trial in France",
"source": "Washington Post",
"link": "https://www.washingtonpost.com/world/europe/son-of-equatorial-guineas-president-facing-trial-in-france/2017/01/02/b03d30d0-d0cb-11e6-9651-54a0154cf5b3_story.html",
"language": "english",
"body": "PARIS — After years of investigation, France on Monday put the son of the president of Equatorial Guinea on trial for corruption, charged with spending many millions in state funds — much of it allegedly in cash — to feed an opulent lifestyle of fast cars, designer clothes, works of art and...",
"emotion": "sadness",
"emoticon": ";("
},
"speech": "Here is the latest news",
"sessionId": "0856125a-d0bc-4cba-990d-cbcfaea536db"
}
}
},
"score": 1
},
"status": {
"code": 206,
"errorType": "partial_content",
"errorDetails": "Webhook call failed. Error message: Webhook contains contexts with empty names or names containing whitespaces. ErrorId: 131000fa-0ec1-4efb-b47c-64301ac7bb2b"
},
"sessionId": "0856125a-d0bc-4cba-990d-cbcfaea536db"
}
The result object is the request that API.AI sends you, you get the contexts objects as well.
The fulfilment object is the response my endpoint sent back to API.AI
Check the documentation

Repeating over nested Objects with Angular JS

I've been playing with the google feed API for a podcasts I run and wanted to include a simple ng-repeat to display the title and link URL to the MP3. However the JSON google provides is nested in several different Objects and Arrays. For instance, my JSON feed looks like this:
{
"responseData": {
"feed": {
"feedUrl": "http://feeds.feedburner.com/stillgotgame",
"title": "2old2play presents Still Got Game",
"link": "http://www.2old2play.com/",
"author": "",
"description": "Still Got Game focuses on the gaming industry from the perspective of adult gamers. We look at news, reviews, and inside information in the world of video games. Each episode touches on the community, the industry, and the games that keep us coming back.",
"type": "rss20",
"entries": [
{
"mediaGroups": [
{
"contents": [
{
"url": "http://traffic.libsyn.com/dsmooth/Still_Got_Game_Episode_33__Coast_to_Coast.mp3",
"fileSize": "35346436",
"type": "audio/mpeg"
}
]
}
],
"title": "Still Got Game Ep. 33: Coast to Coast",
"link": "http://2old2play.com/media/still-got-game-ep-33-coast-coast/",
"author": "podcast#2old2play.com",
"publishedDate": "Tue, 06 May 2014 22:05:01 -0700",
"contentSnippet": "DSmooth finally has his Rocket Bro back. After a multi-week hiatus for Doodirock's move to the West Coast, they boys were back ...",
"content": "DSmooth finally has his Rocket Bro back. After a multi-week hiatus for Doodirock's move to the West Coast, they boys were back in force this week. The duo talk gaming news and the new releases, cover a bunch of viewer feedback, and talk a bit about what may be the worst moving company ever. They'll have you LMFAOing! You can always call the boys at (773) 527-2961 and weigh in yourself, or tune in live Monday nights at 9:00 EDT at http://twitch.tv/2old2play ...",
"categories": [
"Audio"
]
}
]
}
},
"responseDetails": null,
"responseStatus": 200
}
As you can see, in order to get to the items URL to the MP3 I have to go through entries, mediaGroups, and Contents before I even reach the Array I need! I start off inside the entries with this factory I've created:
.factory('audioFEED', function($resource){
return $resource('http://ajax.googleapis.com/ajax/services/feed/load?v=1.0&num=100&q=http://feeds.feedburner.com/stillgotgame',{},
{
query:{
method:'JSONP',
params: {callback: 'JSON_CALLBACK'},
isArray:false,
headers:{
'Access-Control-Allow-Origin': '*'
}
},
});
});
That's easy enough: I just set up the data on the controller here:
'use strict';

// Registers `audioCtrl` on the `twitchappApp` module. The controller asks the
// audioFEED service for the feed and publishes its entries as `$scope.audios`.
// The parameter names `$scope` and `audioFEED` are kept unchanged because the
// dependencies are declared implicitly through them.
angular.module('twitchappApp').controller('audioCtrl', function ($scope, audioFEED) {
  // Called with the feed response: expose the entries and log them.
  function publishEntries(data) {
    var feed = data.responseData.feed;
    $scope.audios = feed.entries;
    console.log($scope.audios);
  }

  audioFEED.query(publishEntries);
});
However, in order to get to that data I have to set up multiple ng-repeats, one inside the next. I would really like to find a better way to handle this data within the controller and access the URL inside a single ng-repeat. This approach seems to add more overhead and is probably not the best overall method. Is there a best practice for this? My current end result looks like this:
<h1>Audio</h1>
<div ng-repeat="audio in audios">
<h3>{{ audio.title }}</h3>
<p>{{audio.contentSnippet}}</p>
<div ng-repeat="play in audio.mediaGroups">
<div ng-repeat="playurl in play.contents">
PLAY
</div>
</div>
</div>
Yuk...

Check out this JSFiddle. Uses underscore to flatten your data down to an easier to work with array. http://jsfiddle.net/ahchurch/sKeY9/3/
Template
<div ng-controller="MyCtrl">
<div ng-repeat="playurl in contents">
PLAY
</div>
</div>
JavaScript
// Create the demo application module; the empty array means no module dependencies are listed.
var myApp = angular.module('myApp',[]);
// Commented-out placeholders showing where a directive or a service could be registered.
//myApp.directive('myDirective', function() {});
//myApp.factory('myService', function() {});
/**
 * Demo controller: flattens the nested feed structure
 * (entries -> mediaGroups -> contents) into one flat list so the template
 * needs only a single ng-repeat.
 *
 * Sets `$scope.contents` to the flattened list of content objects and
 * `$scope.name` to 'Superhero'. Relies on the global Underscore object `_`.
 *
 * @param {Object} $scope scope object that receives `contents` and `name`
 */
function MyCtrl($scope) {
  // Hard-coded sample of the feed response used by the demo.
  var feedResponse = {
    "responseData": {
      "feed": {
        "feedUrl": "http://feeds.feedburner.com/stillgotgame",
        "title": "2old2play presents Still Got Game",
        "link": "http://www.2old2play.com/",
        "author": "",
        "description": "Still Got Game focuses on the gaming industry from the perspective of adult gamers. We look at news, reviews, and inside information in the world of video games. Each episode touches on the community, the industry, and the games that keep us coming back.",
        "type": "rss20",
        "entries": [
          {
            "mediaGroups": [
              {
                "contents": [
                  {
                    "url": "http://traffic.libsyn.com/dsmooth/Still_Got_Game_Episode_33__Coast_to_Coast.mp3",
                    "fileSize": "35346436",
                    "type": "audio/mpeg"
                  }
                ]
              }
            ],
            "title": "Still Got Game Ep. 33: Coast to Coast",
            "link": "http://2old2play.com/media/still-got-game-ep-33-coast-coast/",
            "author": "podcast#2old2play.com",
            "publishedDate": "Tue, 06 May 2014 22:05:01 -0700",
            "contentSnippet": "DSmooth finally has his Rocket Bro back. After a multi-week hiatus for Doodirock's move to the West Coast, they boys were back ...",
            "content": "DSmooth finally has his Rocket Bro back. After a multi-week hiatus for Doodirock's move to the West Coast, they boys were back in force this week. The duo talk gaming news and the new releases, cover a bunch of viewer feedback, and talk a bit about what may be the worst moving company ever. They'll have you LMFAOing! You can always call the boys at (773) 527-2961 and weigh in yourself, or tune in live Monday nights at 9:00 EDT at http://twitch.tv/2old2play ...",
            "categories": [
              "Audio"
            ]
          }
        ]
      }
    },
    "responseDetails": null,
    "responseStatus": 200
  };

  // Underscore: build one list of `contents` arrays per entry (one per media group)...
  var feedEntries = feedResponse.responseData.feed.entries;
  var contentsPerEntry = _.map(feedEntries, function (feedEntry) {
    return _.map(feedEntry.mediaGroups, function (group) {
      return group.contents;
    });
  });

  // ...then collapse the nesting into a single flat list for the template.
  $scope.contents = _.flatten(contentsPerEntry);
  $scope.name = 'Superhero';
}

We Keep Coding

html mysql json google-apps-script actionscript-3 ms-access google-chrome google-maps reporting-services sql-server-2008

css selector using Cheerio exclusion of double class - html

I have a rather "weird" scenario (web scraping). I want to select class=g only but not those with class="g g" (double class g). How to do that in jQuery? If I use $('.g'), it will select both .g and .g .g UPDATE 1: If you don't think .g .g is valid, do View Source in Google search results ;)

Considering the following scenario where you have two Div tag with above classes <div class="g"></div> <div class="g g"></div> Write following to get only those div which have class"g" alert($("div[class='g']").length);

Related

How to sort a list of restaurant names by restaurant rating (possibly from Google Places or Yelp Fusion API)

How to get the phonenumber using google places API

Get request to Google Search

How to get API.AI simply send me the JSON data of the conversation?

Repeating over nested Objects with Angular JS

Categories

Resources