Ok so for fun I decided to scrape all the users who go to my college who are signed up on the website moodle.
This is the program I made with Node.js and cheerio to scrape the site, but I cannot seem to get the text that is inside the H2 tag.
This is the website I am scraping from, http://moodle.ramapo.edu/user/profile.php?id=2101
All I need to do is just change the ID number and it loops through every student.
// Fetch a single Moodle profile page and try to read the user's name from its <h2> heading.
var request = require('request'),
cheerio = require('cheerio');
// NOTE: the `var` statement ended with the `;` on the previous line, so `urls`
// is assigned without a declaration and becomes an implicit global.
urls = [];
//For just single page, eventually will loop through each page.
request('http://moodle.ramapo.edu/user/profile.php?id=2101', function(err, resp, body){
// Only parse the body when the request succeeded with HTTP 200.
if (!err && resp.statusCode == 200) {
var $ = cheerio.load(body);
// Selects `h2.main` elements, with the second argument acting as the search context.
// NOTE(review): the `yui_...` id looks auto-generated, so it may not be present
// in the HTML that `request` receives; confirm against the raw page source.
$('h2.main', '#yui_3_9_1_2_1410303448188_167').each(function(){
//Not sure how to retrieve just the text name of person
});
// Nothing is ever pushed into `urls`, so this logs an empty array.
console.log(urls);
};
});
How do I just select the text inside the H2 tag so that I can log all of them to my console?
That's not the way I'd go about it. Below is a code snippet that should help you out; all you'll need to do is wrap it in a loop and iterate through the URLs you want to scrape. I'd also suggest you check out the tutorial "Scraping the Web With Node.js".
var express = require('express');
var request = require('request');
var cheerio = require('cheerio');
var app = express();

/**
 * GET /scrape
 * Downloads one Moodle profile page, logs the text of every `.main` element
 * to the server console and then answers the HTTP request.
 */
app.get('/scrape', function(req, res){
  // Declared locally; an undeclared assignment here would create a global.
  var url = 'http://moodle.ramapo.edu/user/profile.php?id=2101';

  request(url, function(error, response, html){
    if (error) {
      // Report the failure instead of telling the client to check an empty console.
      console.error('Request for ' + url + ' failed:', error);
      res.status(500).send('Could not fetch the profile page.');
      return;
    }

    var $ = cheerio.load(html);

    // `.each` is the iteration primitive; `.filter` is meant for narrowing a
    // selection and should not be used purely for its side effects.
    $('.main').each(function(){
      var name = $(this).text();
      console.log("name = " + name);
    });

    res.send('Check your console!');
  });
});

app.listen(8081);
exports = module.exports = app;
Related
I want to use message.attachments to get the images from a message and upload them to my website (not on localhost). How would I do that?
I already have a working upload form on my website, but how would I do it from a discord.js bot?
According to the discord.js docs, each attachment has a URL property. You can use this to get the image and upload it.
Example code:
// Collect the URL of every attachment on the message.
// `Collection#map` returns a plain array on every discord.js release, whereas
// `Collection#array()` was removed in discord.js v13.
const URLsToFetch = message.attachments.map((attachment) => attachment.url);
This collects all the attachment URLs. You can then use http, request, or a similar module to download each one and write it to wherever you keep your uploads:
const http = require("http");
const https = require("https");
const path = require("path");
const {URL} = require("url");
const fs = require("fs");

// Directory the downloaded attachments are written to; adjust to your upload folder.
const uploadDir = "path/to";

// Download every URL in `URLsToFetch` and store it as its own file in `uploadDir`.
for (const url of URLsToFetch) {
  const uri = new URL(url);
  // Pick the client module that matches the URL scheme.
  const proto = uri.protocol === "https:" ? https : http;
  // Name each file after the last path segment of its URL so that several
  // attachments do not overwrite one another.
  const destination = path.join(uploadDir, path.basename(uri.pathname) || "attachment");

  proto
    .get(uri, (response) => {
      if (response.statusCode !== 200) {
        // Drain the body so the socket is released, then skip this URL.
        response.resume();
        console.error(`Download of ${url} failed with HTTP ${response.statusCode}`);
        return;
      }

      const chunks = [];
      response.on("data", (chunk) => {
        chunks.push(chunk);
      });
      response.on("end", () => {
        const file = Buffer.concat(chunks);
        fs.writeFile(destination, file, (err) => {
          if (err) {
            console.error(`Could not write ${destination}:`, err);
          }
          // otherwise the file was written successfully
        });
      });
    })
    .on("error", (err) => {
      // Network-level failures (DNS, refused connection, ...) surface here.
      console.error(`Download of ${url} failed:`, err);
    });
}
I am pretty new to web scraping techniques though I already have solid knowledge in terms of PHP / HTML / CSS.
After reading a few tutorials and a lot of tries, I finally managed to scrape my first results as a test.
I use Cheerio + Node.js, and here was the code of my test:
var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');

// Scrape the contact page, collect the text of the first child of every
// `.form-headline` element and save the result to output.json.
request('http://www.passion-de-vin.com/contact/', function (error, response, html) {
  if (error || response.statusCode !== 200) {
    console.error('Request failed:', error || ('HTTP ' + response.statusCode));
    return;
  }

  var $ = cheerio.load(html);
  var parsedResults = [];

  // `.each` rather than `.filter`: the callback is run only for its side effect.
  $('.form-headline').each(function () {
    // Strip line breaks and doubled whitespace. The original computed this
    // cleaned value but then stored the raw text (`titre = titre`).
    var titre = $(this).children().first().text()
      .replace(/(\r?\n)/g, '')
      .replace(/\s\s/g, '');

    // Push the parsed meta-data object into the result list.
    parsedResults.push({ titre: titre });
  });

  // Write the file once, after all elements were visited, instead of
  // rewriting it from inside the loop for every element.
  fs.writeFile('output.json', JSON.stringify(parsedResults, null, 4), function (err) {
    if (err) {
      console.error('Could not write output.json:', err);
      return;
    }
    console.log('File successfully written! - Check your project directory for the output.json file');
  });

  // Log our finished parse results in the terminal
  console.log(parsedResults);
});
I have the result log in my JSON file.
Now I would like to know and understand how I can transmit information to that form, post a result and see or get the result of the post.
So far, all I have read has been unclear to me
So I am trying my hand at Node.js. I want to build a simple crawler which scans a page and then returns all links back in a json file. However, when I run the script it returns 0 links.
Here is my code in its entirety:
// Crawl one page and collect every hyperlink into `obj.table`, then print it as JSON.
// NOTE: `fs` is required but not used anywhere in this snippet.
var request = require('request');
var cheerio = require('cheerio');
var fs = require("fs");
var url = 'https://stackoverflow.com/questions';
//Create the blank array to fill:
var obj = {
table: []
};
// NOTE: this outer `i` is shadowed by the `i` parameter of the `.each` callback
// below and is never read.
var i = 0;
// The callback passed to `request` runs later, once the response has arrived.
request(url, function(err, resp, body){
// `$` and `links` are assigned without a declaration, so they become implicit globals.
// `err` is not checked before `body` is used.
$ = cheerio.load(body);
links = $('a'); //jquery get all hyperlinks
$(links).each(function(i, link){
var actualLink = $(link).attr('href');
obj.table.push({id: i, url:actualLink}); //add some data
// Increments only this callback's own `i` parameter, after its last use.
i++;
});
});
// These two statements run right after the request has been started, before
// the callback above has filled `obj.table`, so the printed table is empty.
var json = JSON.stringify(obj);
console.log(json);
The output in the terminal is so:
$ !!
node nodetest.js
{"table":[]}
Can anyone see why this is blank? Bonus points for writing the final json to a file :)
You must use obj inside the success callback of the request, that's where it gets populated:
// Fetch the page and print its links as JSON. Relies on `request`, `cheerio`,
// `url` and `obj` from the surrounding script.
request(url, function(err, resp, body) {
  if (err) {
    // Without a body there is nothing to parse.
    console.error('Request for ' + url + ' failed:', err);
    return;
  }

  // Declared locally so that no implicit globals are created.
  var $ = cheerio.load(body);

  // cheerio passes the element index as the first callback argument.
  $('a').each(function(i, link) {
    obj.table.push({id: i, url: $(link).attr('href')});
  });

  // Only here you can be sure that the "obj" variable is properly
  // populated because that's where the HTTP request completes
  var json = JSON.stringify(obj);
  console.log(json);
});
In your code you have placed the console.log outside the request success which is asynchronous and thus the obj variable is not yet populated.
Also notice that you don't need the i variable. It will be passed to the each callback automatically, you don't need to be explicitly declaring or incrementing it.
As far as writing the result to a file is concerned, you could use the fs.writeFile function:
// Persist the serialized result; `json` and `fs` come from the surrounding script.
fs.writeFile("/tmp/test", json, function(err) {
  if (err) {
    // Surface the failure instead of finishing silently.
    console.error("Could not save file:", err);
    return;
  }
  console.log("File successfully saved");
});
I am trying to do web scraping and i would like to display the data in JSON format.
My task is to extract each post from the website and display its relevant data in JSON format. My issue is that I cannot seem to target the row (<tr>) and then target each id. I can hard-code the id in my code, but I would like the program to search for the ids and console.log the data of each id in the row.
Example: I want to get the title for the first post by id.
I hope i am making sense.
The website I am trying to extract data from: https://news.ycombinator.com/
My code:
// Skeleton of a Hacker News scraper: downloads the front page, builds a
// `posts` object and logs it, then starts an Express server.
// NOTE: `path` and `fs` are required but not used anywhere in this snippet.
var express = require('express');
var path = require('path');
var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');
var app = express();
var port = 8080;
var url= "https://news.ycombinator.com/";
request(url, function(err,resp,body){
// `err` is not checked before `body` is parsed.
var $ = cheerio.load(body);
// `title` is the cheerio selection of every <tr> in the page, not a title string.
var title = $('tr');
// The following variables are declared but never assigned, so they stay `undefined`.
var uri
var author
var points
var comments
var rank
// A single object is built for the whole page; there is no per-post iteration yet.
var posts = {
postTitle : title,
postUri : uri,
postAuthor : author,
postPoints : points,
postComments : comments,
postRank : rank
}
console.log(posts)
})
// No routes are registered on `app` in this snippet; it only starts listening.
app.listen(port);
console.log('server is listening on' + port);
The trick with Hacker News is that three tr elements make up one displayed row. That's why each element of rows holds three consecutive tr elements. Inside rows.map each item is one such row, and you can access its attributes "row-wise".
let cheerio = require('cheerio');
let request = require('request');
const url = "https://news.ycombinator.com/";

// Every post is rendered as this many consecutive <tr> elements.
const TRS_PER_POST = 3;
// Trailing <tr> elements that belong to the "More" button, not to a post.
const TRAILING_TRS = 2;

// Download the front page and log title, link and comment count of every post.
request(url, function(err, resp, body){
  if (err) {
    console.error('Request for ' + url + ' failed:', err);
    return;
  }

  let $ = cheerio.load(body);
  const tr = $('.itemlist > tr');

  // Floor and clamp the count: `Array(n)` throws a RangeError for a negative
  // or fractional length, which happens whenever the page layout deviates.
  const postCount = Math.max(0, Math.floor((tr.length - TRAILING_TRS) / TRS_PER_POST));

  // Group the flat <tr> list into one cheerio selection per post.
  const rows = [];
  for (let i = 0; i < postCount; ++i) {
    rows.push(tr.slice(TRS_PER_POST * i, TRS_PER_POST * (i + 1)));
  }

  // Declared locally so that no implicit global `res` is created.
  const res = rows.map(function(item) {
    return {
      postTitle: $(item).find('.storylink').text(),
      postUri: $(item).find('.storylink').attr('href'),
      postComments: $(item).find('a+ a').text(),
    };
  });

  console.log(res);
});
Which gives you:
[ { postTitle: 'CockroachDB beta-20161013',
postUri: 'https://jepsen.io/analyses/cockroachdb-beta-20161013',
postComments: '10 comments' },
{ postTitle: 'Attacking the Windows Nvidia Driver',
postUri: 'https://googleprojectzero.blogspot.com/2017/02/attacking-windows-nvidia-driver.html',
postComments: '7 comments' },
{ postTitle: 'DuckDuckGo Donates $300K to Raise the Standard of Trust Online',
postUri: 'https://spreadprivacy.com/2017-donations-d6e4e4230b88#.kazx95v27',
postComments: '25 comments' },
... ]
I am creating a script using Node.js, the fbgraph API and the Express framework. I POST the user's access_token from a page, index.html, to the Node.js server. I am able to retrieve the access_token, and I use the fbgraph API to retrieve further user info. But when I try to send the response JSON object I get this error: Cannot GET /.
Here is my code. I cannot work out where the problem is coming from; everything seems to work. I checked other questions as well, but they are not helpful in my case: I don't need to show any template, I only want to return a response.
NOTE: In my project folder the file structure is like this:
node_modules
app.js
package.json
CODE: app.js
var bodyParser = require('body-parser');
var express = require('express');
var graph = require('fbgraph');
var app = express();

// The bare `bodyParser()` call is deprecated; register the JSON and
// URL-encoded parsers it used to bundle explicitly.
app.use(bodyParser.json());
app.use(bodyParser.urlencoded({ extended: true }));

/**
 * POST /
 * Reads `access_token` from the request body, looks up the user's name, id,
 * profile link and picture through the Graph API and returns them as JSON.
 */
app.post('/', function(req, res) {
  var access_token = req.body && req.body.access_token;
  if (!access_token) {
    res.status(400).json({ error: 'access_token is required' });
    return;
  }

  // NOTE(review): this sets the token on the shared `graph` module object, so
  // concurrent requests may overwrite each other's token; confirm.
  graph.setAccessToken(access_token);

  // The token comes from the client, so escape it before placing it in a URL.
  graph.get("/me?access_token=" + encodeURIComponent(access_token), function(err, b_res) {
    if (err) {
      // Without this guard `b_res.name` below would throw on a failed call.
      console.error('Graph API /me request failed:', err);
      res.status(502).json({ error: 'Could not load the user profile' });
      return;
    }

    var name = b_res.name;
    var id = b_res.id;
    var profileUrl = b_res.link;

    // Retrieve the profile picture URL.
    graph.get("/" + id + "/?fields=picture", function(err, g_res) {
      if (err) {
        console.error('Graph API picture request failed:', err);
        res.status(502).json({ error: 'Could not load the profile picture' });
        return;
      }

      // JSON object to be returned. The "profilerl" key is kept as-is
      // because clients may already depend on that spelling.
      var userObj = {
        "name": name,
        "id": id,
        "profilerl": profileUrl,
        "picurl": g_res.picture.data.url
      };
      console.log(userObj);
      res.json(userObj);
    });
  });
});

app.use(express.static(__dirname + '/'));
app.set('port', process.env.PORT || 3000);
var server = app.listen(app.get('port'));
As discussed in the comments above, you can perform your DB operations right here in the same route, using the userObj attributes in a WHERE clause or in whatever other operation you want, and then pass the object returned by the DB to the view like this:
var bodyParser = require('body-parser');
var express = require('express');
var graph = require('fbgraph');
var app = express();

// The bare `bodyParser()` call is deprecated; register the JSON and
// URL-encoded parsers it used to bundle explicitly.
app.use(bodyParser.json());
app.use(bodyParser.urlencoded({ extended: true }));

/**
 * POST /
 * Reads `access_token` from the request body, looks up the user's name, id,
 * profile link and picture through the Graph API and renders a view with the
 * data obtained from a follow-up DB operation.
 */
app.post('/', function(req, res) {
  var access_token = req.body && req.body.access_token;
  if (!access_token) {
    res.status(400).json({ error: 'access_token is required' });
    return;
  }

  // NOTE(review): this sets the token on the shared `graph` module object, so
  // concurrent requests may overwrite each other's token; confirm.
  graph.setAccessToken(access_token);

  // The token comes from the client, so escape it before placing it in a URL.
  graph.get("/me?access_token=" + encodeURIComponent(access_token), function(err, b_res) {
    if (err) {
      // Without this guard `b_res.name` below would throw on a failed call.
      console.error('Graph API /me request failed:', err);
      res.status(502).json({ error: 'Could not load the user profile' });
      return;
    }

    var name = b_res.name;
    var id = b_res.id;
    var profileUrl = b_res.link;

    // Retrieve the profile picture URL.
    graph.get("/" + id + "/?fields=picture", function(err, g_res) {
      if (err) {
        console.error('Graph API picture request failed:', err);
        res.status(502).json({ error: 'Could not load the profile picture' });
        return;
      }

      // The "profilerl" key is kept as-is because other code may already
      // depend on that spelling.
      var userObj = {
        "name": name,
        "id": id,
        "profilerl": profileUrl,
        "picurl": g_res.picture.data.url
      };
      console.log(userObj);

      // Perform your DB operation with `userObj` here and pass the object the
      // DB returns to the view.
      // NOTE(review): `dataAfterDbOpeations` is a placeholder for that result
      // and is not defined in this snippet. `res.render` presumably also needs
      // a view engine configured on `app`, which is not shown here.
      res.render('views/index', {data: dataAfterDbOpeations});
    });
  });
});

app.use(express.static(__dirname + '/'));
app.set('port', process.env.PORT || 3000);
var server = app.listen(app.get('port'));