I am trying to do web scraping and I would like to display the data in JSON format.
My task is to extract each post from the website and display its relevant data in JSON format. My issue is that I cannot seem to target the row (tr) and then target each id. I can hard-code an id in my code, but I would like the program to search for each id and console.log the data of each id in the row.
Example: I want to get the title of the first post by its id.
I hope I am making sense.
The website I am trying to extract data from: https://news.ycombinator.com/
My code:
var express = require('express');
var path = require('path');
var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');

var app = express();
var port = 8080;
var url = "https://news.ycombinator.com/";

request(url, function (err, resp, body) {
    var $ = cheerio.load(body);
    var title = $('tr');
    var uri;
    var author;
    var points;
    var comments;
    var rank;
    var posts = {
        postTitle: title,
        postUri: uri,
        postAuthor: author,
        postPoints: points,
        postComments: comments,
        postRank: rank
    };
    console.log(posts);
});

app.listen(port);
console.log('server is listening on ' + port);
The trick with Hacker News is that three tr elements make up one visible row. That's why each element of rows is built from three consecutive tr elements. Inside rows.map each item is one row, so you can access the attributes row-wise.
let cheerio = require('cheerio');
let request = require('request');

const url = "https://news.ycombinator.com/";

request(url, function (err, resp, body) {
    let $ = cheerio.load(body);
    const tr = $('.itemlist > tr');
    let rows = Array((tr.length - 2) / 3); // the last two tr elements belong to the "More" button
    for (var i = 0; i < (tr.length - 2) / 3; ++i) {
        rows[i] = tr.slice(3 * i, 3 * (i + 1));
    }
    const res = rows.map(function (item, index) {
        return {
            postTitle: $(item).find('.storylink').text(),
            postUri: $(item).find('.storylink').attr('href'),
            postComments: $(item).find('a + a').text(),
        };
    });
    console.log(res);
});
Which gives you:
[ { postTitle: 'CockroachDB beta-20161013',
postUri: 'https://jepsen.io/analyses/cockroachdb-beta-20161013',
postComments: '10 comments' },
{ postTitle: 'Attacking the Windows Nvidia Driver',
postUri: 'https://googleprojectzero.blogspot.com/2017/02/attacking-windows-nvidia-driver.html',
postComments: '7 comments' },
{ postTitle: 'DuckDuckGo Donates $300K to Raise the Standard of Trust Online',
postUri: 'https://spreadprivacy.com/2017-donations-d6e4e4230b88#.kazx95v27',
postComments: '25 comments' },
... ]
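The question also asks for the author, points, and rank. The same row-wise pattern extends to those fields; as a hedged sketch, the .hnuser, .score, and .rank selectors below are the class names Hacker News used for those cells at the time (only .storylink appears in the answer itself), so verify them against the current markup. The map in the answer would then become:

const res = rows.map(function (item) {
    return {
        postTitle: $(item).find('.storylink').text(),
        postUri: $(item).find('.storylink').attr('href'),
        postAuthor: $(item).find('.hnuser').text(),   // assumed class for the submitter
        postPoints: $(item).find('.score').text(),    // assumed class, e.g. "123 points"
        postRank: $(item).find('.rank').text(),       // assumed class, e.g. "1."
        postComments: $(item).find('a + a').text(),
    };
});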
Related
I am trying to get weather forecast data from the weatherapi.com API, but when I parse the JSON data it shows the error "Unexpected end of JSON input". I also tried a setTimeout, in case it needed time to fetch the data, but it did not help.
const express = require('express');
const https = require('https');
const bodyParser = require('body-parser');

const app = express();
app.use(express.static("public"));
app.use(bodyParser.urlencoded({ extended: true }));

app.post("/weather-data", function (req, res) {
    var city_name = req.body.city_name;
    city_name = city_name.toUpperCase();
    const key = "4b6f380fa80745beb2c174529222912";
    const days = 1;
    url = "https://api.weatherapi.com/v1/forecast.json?key=" + key + "&q=" + city_name;
    https.get(url, (response) => {
        console.log(response.statusCode);
        const status = response.statusCode;
        if (status == 200) {
            response.on("data", function (data) {
                const WeatherData = JSON.parse(data);
                const region = WeatherData.location.region;
                const country = WeatherData.location.country;
                console.log("region is " + region + " and country is " + country);
            });
        }
    });
});
Note that the response.on("data") event fires every time a chunk of data arrives, and that can happen multiple times per request (the data does not necessarily arrive all at once, especially for large payloads).
You should buffer the chunks and parse them only after all the data has arrived:
let dataBuffer = '';

response.on("data", function (data) {
    dataBuffer += data;
});

response.on("end", function () {
    const weatherData = JSON.parse(dataBuffer);
    ...
    ...
});
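For completeness, here is a sketch of how the buffered version could sit inside the original /weather-data route. The route, key, and the location.region / location.country fields come straight from the question; the error handling is only an assumption about how you might want to respond when the API call fails:

app.post("/weather-data", function (req, res) {
    const city_name = req.body.city_name.toUpperCase();
    const key = "4b6f380fa80745beb2c174529222912";
    const url = "https://api.weatherapi.com/v1/forecast.json?key=" + key + "&q=" + city_name;

    https.get(url, (response) => {
        if (response.statusCode !== 200) {
            return res.status(502).send("Weather API returned status " + response.statusCode);
        }

        let dataBuffer = '';
        response.on("data", (chunk) => {
            dataBuffer += chunk;                        // collect every chunk
        });
        response.on("end", () => {
            const weatherData = JSON.parse(dataBuffer); // parse once, after the last chunk
            res.json({
                region: weatherData.location.region,
                country: weatherData.location.country
            });
        });
    });
});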
I am pretty new to web scraping techniques, though I already have solid knowledge of PHP / HTML / CSS.
After reading a few tutorials and making a lot of attempts, I finally managed to scrape my first results as a test.
I use Cheerio + Node.js, and here is the code of my test:
var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');

request('http://www.passion-de-vin.com/contact/', function (error, response, html) {
    if (!error && response.statusCode == 200) {
        var $ = cheerio.load(html);
        var parsedResults = [];
        $('.form-headline').filter(function (i, element) {
            var a = $(this).children();
            var titre = a.first().text();
            release2 = titre.replace(/(\r?\n)/g, '');
            release = release2.replace(/\s\s/g, '');
            titre = titre;
            // Our parsed meta data object
            var metadata = {
                titre,
            };
            // Push meta-data into parsedResults array
            parsedResults.push(metadata);
            fs.writeFile('output.json', JSON.stringify(parsedResults, null, 4), function (err) {
                console.log('File successfully written! - Check your project directory for the output.json file');
            });
        });
        // Log our finished parse results in the terminal
        console.log(parsedResults);
    }
});
I have the result logged in my JSON file.
Now I would like to know and understand how I can submit information to that form, post it, and see or get the result of the post.
So far, everything I have read has been unclear to me.
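No answer is included for this one, but with the same request library a form is usually submitted by sending a POST with the form's field names and then inspecting the response. The sketch below is built on assumptions: the field names (nom, email, message) and the target URL are placeholders, and the real names have to be read from the form's HTML and its action attribute:

var request = require('request');
var cheerio = require('cheerio');

// Hypothetical field names and action URL; inspect the actual <form> markup for the real ones.
request.post({
    url: 'http://www.passion-de-vin.com/contact/',
    form: {
        nom: 'Jean Dupont',
        email: 'jean@example.com',
        message: 'Bonjour'
    }
}, function (error, response, body) {
    if (error) {
        return console.error(error);
    }
    // The server's reply (confirmation page, validation errors, ...) comes back as HTML
    // and can be parsed with cheerio just like a scraped page.
    var $ = cheerio.load(body);
    console.log(response.statusCode);
    console.log($('body').text().trim());
});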
I'm trying to write an XPath statement to fetch the contents of each row in a table, but only when the 2nd column of each row is not set to "TBA".
The page I am working from is this page. I am new to using XPath.
I've come up with the following statement, which I've managed to test successfully (or appears successful anyway) with an online XPath tester, but have been unable to figure out how to apply it in node.js:
//*[@id="body_column_left"]/div[4]/table/tbody/tr/[not(contains(./td[2], 'TBA'))]
My attempt is below. I've tried variations, but I can't get it to even validate as a valid XPath statement, and as a result I've been lost in not-very-helpful stack traces:
var fs = require('fs');
var xpath = require('xpath');
var parse5 = require('parse5');
var xmlser = require('xmlserializer');
var dom = require('xmldom').DOMParser;
var request = require('request');

var getHTML = function (url, callback) {
    request(url, function (error, response, body) {
        if (!error && response.statusCode == 200) {
            return callback(body); // return the HTML
        }
    });
};

getHTML("http://au.cybergamer.com/pc/csgo/ladder/scheduled/", function (html) {
    var parser = new parse5.Parser();
    var document = parser.parse(html.toString());
    var xhtml = xmlser.serializeToString(document);
    var doc = new dom().parseFromString(xhtml);
    var select = xpath.useNamespaces({"x": "http://www.w3.org/1999/xhtml"});
    var nodes = select("//x:*[@id=\"body_column_left\"]/div[4]/table/tbody/tr/[not(contains(./td[2], 'TBA'))]", doc);
    console.log(nodes);
});
Any help would be appreciated!
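As a side note before the answers: the stray slash in front of the [not(...)] predicate is probably what keeps the expression from validating, since a predicate attaches directly to the step it filters. And because the document is serialised as XHTML in the code above, every element step needs the x: namespace prefix, not just the first one. A corrected expression might look like the sketch below; the body_column_left id and the div[4] position are taken from the question and may no longer match the live page:

// Predicate directly on the tr step (no "/" before "[") and x: prefixes on every element.
var expr = "//x:*[@id='body_column_left']/x:div[4]/x:table/x:tbody/x:tr[not(contains(x:td[2], 'TBA'))]";
var rows = select(expr, doc);
console.log(rows.length);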
I ended up solving this issue using cheerio instead of xpath:
See below:
var $ = cheerio.load(html);
$('.s_grad br').replaceWith("\n");
$('.s_grad thead').remove();

$('.s_grad tr').each(function (i, elem) {
    rows[i] = $(this).text();
    rows[i] = rows[i].replace(/^\s*[\r\n]/gm, ""); // remove empty newlines
    matches.push(new match($(this).find('a').attr('href').substring(7).slice(0, -1))); // create matches
});
How about using xpath-html? I loved its simplicity.
const xpath = require("xpath-html");

const nodes = xpath
    .fromPageSource(html)
    .findElements("//img[starts-with(@src, 'https://cloud.shopback.com')]");
OK, so for fun I decided to scrape all the users at my college who are signed up on the Moodle website.
This is the program I made with Node.js and Cheerio to scrape the site, but I cannot seem to get the text inside the H2 tag.
This is the website I am scraping from: http://moodle.ramapo.edu/user/profile.php?id=2101
All I need to do is change the ID number to loop through every student.
var request = require('request'),
    cheerio = require('cheerio');

urls = [];

//For just single page, eventually will loop through each page.
request('http://moodle.ramapo.edu/user/profile.php?id=2101', function (err, resp, body) {
    if (!err && resp.statusCode == 200) {
        var $ = cheerio.load(body);
        $('h2.main', '#yui_3_9_1_2_1410303448188_167').each(function () {
            //Not sure how to retrieve just the text name of person
        });
        console.log(urls);
    }
});
How do I just select the text inside the H2 tag so that I can log all of them to my console?
That's not the way I'd go about it. Below is a code snippet that should help you out; all you'll need to do is wrap it in a loop and iterate through the URLs you want to scrape. I'd also suggest you check out this tutorial: Scraping the Web With Node.js.
var express = require('express');
var request = require('request');
var cheerio = require('cheerio');

var app = express();

app.get('/scrape', function (req, res) {
    url = 'http://moodle.ramapo.edu/user/profile.php?id=2101';
    request(url, function (error, response, html) {
        if (!error) {
            var $ = cheerio.load(html);
            var name;
            $('.main').filter(function () {
                var data = $(this);
                name = data.text();
                console.log("name = " + name);
            });
        }
        res.send('Check your console!');
    });
});

app.listen('8081');
exports = module.exports = app;
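Since the question mentions changing the ID to loop through every student, here is one hedged way to extend the snippet above. The 2101 to 2110 range is only an example, and $('h2.main').first().text() rests on the assumption that the first h2.main on each profile page holds the name:

var request = require('request');
var cheerio = require('cheerio');

var baseUrl = 'http://moodle.ramapo.edu/user/profile.php?id=';
var names = [];

// Example range only; replace with the real span of student IDs.
for (var id = 2101; id <= 2110; id++) {
    (function (currentId) {
        request(baseUrl + currentId, function (err, resp, body) {
            if (!err && resp.statusCode == 200) {
                var $ = cheerio.load(body);
                var name = $('h2.main').first().text().trim(); // just the text inside the h2
                if (name) {
                    names.push({ id: currentId, name: name });
                }
                console.log(currentId, name);
            }
        });
    })(id);
}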
I am creating a script using Node.js, the fbgraph API and the Express framework. I POST the user's access_token from a page index.html on the Node.js server. I am able to retrieve the access_token, and I used the fbgraph API to retrieve further user info. But when I try to send the response JSON object I am getting this error: Cannot GET /.
Here is my code. I am not able to understand where the problem is coming from; everything seems to work. I checked other questions too, but they are not helpful in my case; I don't need to show any template. I only want to return a response.
NOTE: My project folder's file structure is like this:
node_modules
app.js
package.json
CODE: app.js
var bodyParser = require('body-parser');
var express = require('express');
var graph = require('fbgraph');

var app = express();
app.use(bodyParser());

//Retrieve POST data
app.post('/', function (req, res) {
    // console.log(req.body.access_token);
    var access_token = req.body.access_token;

    //set access token
    graph.setAccessToken(access_token);

    //Graph Api request
    graph.get("/me?access_token=" + access_token, function (err, b_res) {
        // console.log(b_res)
        var name = b_res.name;
        var id = b_res.id;
        var profileUrl = b_res.link;

        //Retrieve profile url
        graph.get("/" + id + "/?fields=picture", function (err, g_res) {
            //JSON object to be returned
            var userObj = {
                "name": name,
                "id": id,
                "profilerl": profileUrl,
                "picurl": g_res.picture.data.url
            };
            console.log(userObj);
            res.json(userObj);
            //res.send(userObj);
        });
    });
});

app.use(express.static(__dirname + '/'));
app.set('port', process.env.PORT || 3000);

var server = app.listen(app.get('port'));
As discussed in the comments above, you can perform DB operations right here, using the userObj attributes in a WHERE clause (or whatever other operation you want), and then pass the object returned by the DB to the view like this:
var bodyParser = require('body-parser');
var express = require('express');
var graph = require('fbgraph');

var app = express();
app.use(bodyParser());

//Retrieve POST data
app.post('/', function (req, res) {
    // console.log(req.body.access_token);
    var access_token = req.body.access_token;

    //set access token
    graph.setAccessToken(access_token);

    //Graph Api request
    graph.get("/me?access_token=" + access_token, function (err, b_res) {
        // console.log(b_res)
        var name = b_res.name;
        var id = b_res.id;
        var profileUrl = b_res.link;

        //Retrieve profile url
        graph.get("/" + id + "/?fields=picture", function (err, g_res) {
            //JSON object to be returned
            var userObj = {
                "name": name,
                "id": id,
                "profilerl": profileUrl,
                "picurl": g_res.picture.data.url
            };
            console.log(userObj);
            //res.json(userObj);
            //res.send(userObj);

            // Perform the DB operation using userObj here; when the DB returns, pass the result to the view.
            // Let's say dataAfterDbOpeations is the object returned by your query.
            res.render('views/index', {data: dataAfterDbOpeations});
        });
    });
});

app.use(express.static(__dirname + '/'));
app.set('port', process.env.PORT || 3000);

var server = app.listen(app.get('port'));
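Neither snippet above addresses the Cannot GET / message itself. A likely explanation, given the file listing in the question (no index.html next to app.js), is that opening the site in a browser issues GET /, while the app only defines app.post('/'), so Express falls back to its default Cannot GET / response. A minimal, hypothetical way to confirm that is to add a GET handler (in a real app this would normally serve index.html instead of plain text):

// Hypothetical check: give the browser something to load on GET /.
app.get('/', function (req, res) {
    res.send('POST your access_token to / to get the user object back as JSON.');
});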