Get only one value with cheerio from a web page (node js) - cheerio

I'm having an issue getting only one value: with my code below I get the values of all the divs named toto. What can I do if I only want the value from the 4th div? I hope that's clear :)
Here is my code:
const request = require('request');
const cheerio = require('cheerio');

request('https://www....', (error, response, html) => {
    if (!error && response.statusCode == 200) {
        const $ = cheerio.load(html);
        $('div.toto').each((i, ele) => {
            const title = $(ele).text();
            console.log(title);
        });
    }
});
I'm sure it's a rookie question but I'm stuck at this point.
Thanks a lot for your help!
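One way to narrow the selection to just the 4th matching element is cheerio's .eq() method, which takes a zero-based index. A minimal sketch, assuming the same request/cheerio setup as above:
const $ = cheerio.load(html);
// .eq(3) keeps only the 4th matched div (indexes start at 0)
const fourthTitle = $('div.toto').eq(3).text();
console.log(fourthTitle);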

Related

Error: Malformed UTF-8 data stringify (while scraping yahoo finance historical)

I have a script to scrape Yahoo Historical Data but it looks like the decrypt service stopped working.
function Scrapeyahoo(symbol) {
  // modification of 27/1/23 made by Tanaike
  // https://stackoverflow.com/questions/75250562/google-apps-script-stopped-scraping-data-from-yahoo-finance/75253348#75253348
  const s = encodeURI(symbol);
  const url = 'https://finance.yahoo.com/quote/' + s + '/history?p=' + s;
  var html = UrlFetchApp.fetch(url).getContentText().match(/root.App.main = ([\s\S\w]+?);\n/);
  if (!html || html.length == 1) return;
  var obj = JSON.parse(html[1].trim());
  var key = [...new Map(Object.entries(obj).filter(([k]) => !["context", "plugins"].includes(k)).splice(-4)).values()].join("");
  if (!key) return;
  const cdnjs = "https://cdnjs.cloudflare.com/ajax/libs/crypto-js/4.1.1/crypto-js.min.js";
  eval(UrlFetchApp.fetch(cdnjs).getContentText());
  const obj1 = JSON.parse(CryptoJS.enc.Utf8.stringify(CryptoJS.AES.decrypt(obj.context.dispatcher.stores, key)));
  const header = ["date", "open", "high", "low", "close", "adjclose", "volume"];
  const ar = obj1.HistoricalPriceStore.prices.map(o => header.map(h => h == "date" ? new Date(o[h] * 1000) : (o[h] || "")));
  return ar;
}
I get the error Malformed UTF-8 data stringify in the line
JSON.parse(CryptoJS.enc.Utf8.stringify(CryptoJS.AES.decrypt(obj.context.dispatcher.stores, key)));
A few weeks ago @Tanaike solved a similar issue here, but it looks like there have been new changes.
I'd appreciate any help with this problem.
Thanks in advance.
It seems that the specification for retrieving the key has been changed. In this case, var key = [...new Map(Object.entries(obj).filter(([k]) => !["context", "plugins"].includes(k)).splice(-4)).values()].join(""); no longer returns the correct key. It also seems that the logic for retrieving the valid key has been changed, but unfortunately I still cannot find the correct logic. So, in this answer, I would like to refer to this thread, where the valid keys are listed in a text file. When this is reflected in your script, it becomes as follows.
Modified script:
function Scrapeyahoo(symbol) {
  const s = encodeURI(symbol);
  const url = 'https://finance.yahoo.com/quote/' + s + '/history?p=' + s;
  var html = UrlFetchApp.fetch(url).getContentText().match(/root.App.main = ([\s\S\w]+?);\n/);
  if (!html || html.length == 1) return;
  var obj = JSON.parse(html[1].trim());
  const cdnjs = "https://cdnjs.cloudflare.com/ajax/libs/crypto-js/4.1.1/crypto-js.min.js";
  eval(UrlFetchApp.fetch(cdnjs).getContentText());
  const keyFile = "https://github.com/ranaroussi/yfinance/raw/main/yfinance/scrapers/yahoo-keys.txt";
  const res = UrlFetchApp.fetch(keyFile);
  const keys = res.getContentText().split("\n").filter(String);
  let obj1 = keys.reduce((ar, key) => {
    try {
      const o = JSON.parse(CryptoJS.enc.Utf8.stringify(CryptoJS.AES.decrypt(obj.context.dispatcher.stores, key.trim())));
      ar.push(o);
    } catch (e) {
      // console.log(e.message)
    }
    return ar;
  }, []);
  if (obj1.length == 0) {
    throw new Error("Specification at the server side might be changed. Please check it.");
  }
  obj1 = obj1[0];
  const header = ["date", "open", "high", "low", "close", "adjclose", "volume"];
  const ar = obj1.HistoricalPriceStore.prices.map(o => header.map(h => h == "date" ? new Date(o[h] * 1000) : (o[h] || "")));
  return ar;
}
When I tested this script with a sample value of CL=F as symbol, I confirmed that the script worked.
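As a usage sketch, assuming the modified script above is in the same Apps Script project, the returned rows could be written to a sheet like this (the function name and range are only illustrative):
function writePrices() {
  const rows = Scrapeyahoo("CL=F"); // the sample symbol from the test above
  if (!rows || rows.length == 0) return;
  const sheet = SpreadsheetApp.getActiveSpreadsheet().getActiveSheet();
  sheet.getRange(1, 1, rows.length, rows[0].length).setValues(rows);
}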
Note:
In this sample, eval(UrlFetchApp.fetch(cdnjs).getContentText()) is used in order to load crypto-js. If you don't want to use eval, you can instead copy and paste the script from https://cdnjs.cloudflare.com/ajax/libs/crypto-js/4.1.1/crypto-js.min.js into the script editor of Google Apps Script. By this, the process cost can be reduced.
I can confirm that this method works for the current situation (February 15, 2023). But if the specification of the data and HTML is changed in a future update on the server side, this script might no longer work. Please be careful about this.
Reference:
crypto-js

Failed to use map function in the right way within google apps script

I can't use the map function in the right way within Google Apps Script while scraping two fields (movie name and image) from a webpage.
function scrapeMovies() {
  const URL = "https://yts.am/browse-movies";
  const response = UrlFetchApp.fetch(URL);
  const $ = Cheerio.load(response.getContentText());
  const container = $(".browse-movie-wrap");
  const result = container.map(item => {
    const movieName = $(item).find('a.browse-movie-title').text();
    const movieImage = $(item).find('img.img-responsive').attr('src');
    console.log(movieName, movieImage);
  });
}
When I execute the script, all I get is undefined as result.
You can still use map but you need to change the way you access the element.
The reason it is undefined is that you were trying to do a find on the index value. Upon testing, the map callback receives (index, item) rather than (item, index), so taking the element from the 2nd argument fixes the issue.
const result = container.map((index, item) => {
  const movieName = $(item).find('a.browse-movie-title').text();
  const movieImage = $(item).find('img.img-responsive').attr('src');
  console.log(movieName, movieImage);
});
But since you aren't returning anything, just use each as mentioned by Sysix.
Note:
For some reason, execution doesn't end if I return both values into result when using map and then try to log result.
I tested another way to store the data and the script below worked.
var result = [];
container.each((index, item) => {
  const movieName = $(item).find('a.browse-movie-title').text();
  const movieImage = $(item).find('img.img-responsive').attr('src');
  result.push([movieName, movieImage]);
});
console.log(result);
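If you do want to keep map, the values returned from the callback can usually be unwrapped into a plain array with .get(). A sketch of that approach, assuming the same container selection as above and the usual cheerio .map/.get behaviour:
const result = container.map((index, item) => {
  const movieName = $(item).find('a.browse-movie-title').text();
  const movieImage = $(item).find('img.img-responsive').attr('src');
  // returning an object keeps each movie's fields together in the unwrapped array
  return { movieName: movieName, movieImage: movieImage };
}).get();
console.log(result);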

Node.js + Cheerio Scraping : How to post in a contact form?

I am pretty new to web scraping techniques though I already have solid knowledge in terms of PHP / HTML / CSS.
After reading a few tutorials and a lot of tries, I finally managed to scrape my first results as a test.
I use Cheerio + Node.js, and here is the code from my test:
var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');

request('http://www.passion-de-vin.com/contact/', function (error, response, html) {
    if (!error && response.statusCode == 200) {
        var $ = cheerio.load(html);
        var parsedResults = [];
        $('.form-headline').filter(function (i, element) {
            var a = $(this).children();
            var titre = a.first().text();
            var release2 = titre.replace(/(\r?\n)/g, '');
            var release = release2.replace(/\s\s/g, '');
            // Our parsed meta data object
            var metadata = {
                titre,
            };
            // Push meta-data into parsedResults array
            parsedResults.push(metadata);
            fs.writeFile('output.json', JSON.stringify(parsedResults, null, 4), function (err) {
                console.log('File successfully written! - Check your project directory for the output.json file');
            });
        });
        // Log our finished parse results in the terminal
        console.log(parsedResults);
    }
});
I have the result logged in my JSON file.
Now I would like to know and understand how I can submit information to that form, post it, and see or get the result of the post.
So far, everything I have read has been unclear to me.
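A minimal sketch of one way to post to the form with the same request module; the action URL and field names (your_name, your_email, your_message) are assumptions, so inspect the form's action attribute and input names in the page's HTML first:
request.post({
    url: 'http://www.passion-de-vin.com/contact/', // assumed action URL, check the form's action attribute
    form: {
        your_name: 'Jane Doe',          // hypothetical field names, replace with the
        your_email: 'jane@example.com', // name attributes of the form's inputs
        your_message: 'Hello'
    }
}, function (error, response, body) {
    if (!error) {
        // the server's reply to the POST, loaded with cheerio to inspect the result
        var $ = cheerio.load(body);
        console.log(response.statusCode, $('body').text().trim().slice(0, 200));
    }
});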

Parsing HTML/XML with XPath in node.js

I'm trying to write an XPath statement to fetch the contents of each row in a table, but only when the 2nd column of each row is not set to "TBA".
The page I am working from is this page. I am new to using XPath.
I've come up with the following statement, which I've managed to test successfully (or appears successful anyway) with an online XPath tester, but have been unable to figure out how to apply it in node.js:
//*[@id="body_column_left"]/div[4]/table/tbody/tr/[not(contains(./td[2], 'TBA'))]
This is my attempt below. I've tried variations, but I can't get it to even validate as a valid XPath statement, and as a result I've been lost in not very helpful stack traces:
var fs = require('fs');
var xpath = require('xpath');
var parse5 = require('parse5');
var xmlser = require('xmlserializer');
var dom = require('xmldom').DOMParser;
var request = require('request');

var getHTML = function (url, callback) {
    request(url, function (error, response, body) {
        if (!error && response.statusCode == 200) {
            return callback(body); // return the HTML
        }
    });
};

getHTML("http://au.cybergamer.com/pc/csgo/ladder/scheduled/", function (html) {
    var parser = new parse5.Parser();
    var document = parser.parse(html.toString());
    var xhtml = xmlser.serializeToString(document);
    var doc = new dom().parseFromString(xhtml);
    var select = xpath.useNamespaces({"x": "http://www.w3.org/1999/xhtml"});
    var nodes = select("//x:*[@id=\"body_column_left\"]/div[4]/table/tbody/tr/[not(contains(./td[2], 'TBA'))]", doc);
    console.log(nodes);
});
Any help would be appreciated!
I ended up solving this issue using cheerio instead of xpath:
See below:
var $ = cheerio.load(html);
var rows = [];    // text of each table row
var matches = []; // match objects built from each row's link (the match constructor is defined elsewhere)

$('.s_grad br').replaceWith("\n");
$('.s_grad thead').remove();
$('.s_grad tr').each(function (i, elem) {
    rows[i] = $(this).text();
    rows[i] = rows[i].replace(/^\s*[\r\n]/gm, ""); // remove empty newlines
    matches.push(new match($(this).find('a').attr('href').substring(7).slice(0, -1))); // create matches
});
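For completeness, the original expression also fails because of the stray slash before the predicate and the missing namespace prefixes inside the path. A corrected XPath, assuming the question's parse5/xmldom/xpath setup and the x prefix bound to the XHTML namespace, would look something like:
// the predicate is attached directly to tr and every element test carries the x: prefix
var nodes = select(
    "//x:*[@id='body_column_left']/x:div[4]/x:table/x:tbody/x:tr[not(contains(x:td[2], 'TBA'))]",
    doc
);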
How about using xpath-html? I loved its simplicity.
const xpath = require("xpath-html");

const nodes = xpath
    .fromPageSource(html)
    .findElements("//img[starts-with(@src, 'https://cloud.shopback.com')]");

Scraping Node.js: Getting text from H2 header

Ok, so for fun I decided to scrape all the users who go to my college and are signed up on the website Moodle.
This is the program I made with Node.js and cheerio that scrapes the site, but I cannot seem to get the text that is inside the H2 tag.
This is the website I am scraping from, http://moodle.ramapo.edu/user/profile.php?id=2101
All I need to do is just change the ID number and it loops through every student.
var request = require('request'),
    cheerio = require('cheerio'),
    urls = [];

// For just a single page; eventually this will loop through each page.
request('http://moodle.ramapo.edu/user/profile.php?id=2101', function (err, resp, body) {
    if (!err && resp.statusCode == 200) {
        var $ = cheerio.load(body);
        $('h2.main', '#yui_3_9_1_2_1410303448188_167').each(function () {
            // Not sure how to retrieve just the text name of the person
        });
        console.log(urls);
    }
});
How do I just select the text inside the H2 tag so that I can log all of them to my console?
That's not the way I'd go about it. Below is a code snippet that should help you out; all you'll need to do is wrap it in a loop and iterate through the URLs you want to scrape. I'd also suggest you check out the tutorial Scraping the Web With Node.js.
var express = require('express');
var request = require('request');
var cheerio = require('cheerio');
var app = express();

app.get('/scrape', function (req, res) {
    url = 'http://moodle.ramapo.edu/user/profile.php?id=2101';
    request(url, function (error, response, html) {
        if (!error) {
            var $ = cheerio.load(html);
            var name;
            $('.main').filter(function () {
                var data = $(this);
                name = data.text();
                console.log("name = " + name);
            });
        }
        res.send('Check your console!');
    });
});

app.listen('8081');
exports = module.exports = app;
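If the goal is just the name text rather than an Express endpoint, a shorter sketch using only request and cheerio, and assuming the profile name sits in an h2 with class main, would be:
request('http://moodle.ramapo.edu/user/profile.php?id=2101', function (err, resp, body) {
    if (!err && resp.statusCode == 200) {
        var $ = cheerio.load(body);
        // .first() guards against multiple h2.main elements, .trim() strips surrounding whitespace
        var name = $('h2.main').first().text().trim();
        console.log(name);
    }
});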