My end goal is to open a local html file with javascript embedded, creating a map with polygons, and take a screenshot of it using PhantomJS. I have written a simple JS file to do this:
var page = require('webpage').create();
page.open('https://www.google.com/maps', function(status) {
console.log('State: ' + status);
if(status === 'success') {
page.render('example.pdf', {format: 'pdf', quality: '100'});
}
phantom.exit();
});
This returns the error:
ReferenceError: Can't find variable: google
I've tried this on a local html file and on other websites using google maps and I keep getting the same error. I have been successful in taking a screenshot of other websites without google maps. Searching the internet it doesn't seem like people have had issues like this, and have been successful in taking screenshots of pages with google maps...so I'm wondering what could be wrong.
Another note: I installed PhantomJS as a gem in my rails project and am running the javascript file through the rails console using this gem. I have tried it using the standard installation of PhantomJS (v 2.0.0) and it still didn't work.
You'll have to wait for an element in the DOM.
for example on maps.google.com, you can wait for the watermark which is loaded after all tiles are loaded.
var page = require('webpage').create();
page.open('https://www.google.com/maps', function (status) {
console.log('State: ' + status);
if (status === 'success') {
waitFor(function () {
return page.evaluate(function () {
var document_contains_watermark =
document.body.contains(document.getElementById('watermark'));
return document_contains_watermark;
});
}, function () {
page.render('maps-google-com.pdf', {format: 'pdf', quality: '100'});
phantom.exit();
});
}
});
function waitFor(testFn, onReady) {
var loaded = false;
var interval = setInterval(function () {
loaded = testFn();
if (loaded) {
onReady();
clearInterval(interval);
}
}, 1000);
}
If you want to take a screenshot on a page that you developed, use the same above logic but append by yourself an element on the google maps idle event.
google.maps.event.addListenerOnce(map, 'idle', function () {
var loadedElem = document.createElement('div');
loadedElem.setAttribute("id", "idLoadedElem");
document.body.appendChild(loadedElem);
});
you should give puppeter a go, it makes that easy:
const puppeteer = require('puppeteer');
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto('https://example.com');
await page.screenshot({path: 'example.pdf'});
await browser.close();
})();
Related
I am trying to scrape a website but I don't get some of the elements, because these elements are dynamically created.
I use the cheerio in node.js and My code is below.
var request = require('request');
var cheerio = require('cheerio');
var url = "http://www.bdtong.co.kr/index.php?c_category=C02";
request(url, function (err, res, html) {
var $ = cheerio.load(html);
$('.listMain > li').each(function () {
console.log($(this).find('a').attr('href'));
});
});
This code returns empty response, because when the page is loaded, the <ul id="store_list" class="listMain"> is empty.
The content has not been appended yet.
How can I get these elements using node.js? How can I scrape pages with dynamic content?
Here you go;
var phantom = require('phantom');
phantom.create(function (ph) {
ph.createPage(function (page) {
var url = "http://www.bdtong.co.kr/index.php?c_category=C02";
page.open(url, function() {
page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function() {
page.evaluate(function() {
$('.listMain > li').each(function () {
console.log($(this).find('a').attr('href'));
});
}, function(){
ph.exit()
});
});
});
});
});
Check out GoogleChrome/puppeteer
Headless Chrome Node API
It makes scraping pretty trivial. The following example will scrape the headline over at npmjs.com (assuming .npm-expansions remains)
const puppeteer = require('puppeteer');
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto('https://www.npmjs.com/');
const textContent = await page.evaluate(() => {
return document.querySelector('.npm-expansions').textContent
});
console.log(textContent); /* No Problem Mate */
browser.close();
})();
evaluate will allow for the inspection of the dynamic element as this will run scripts on the page.
Use the new npm module x-ray, with a pluggable web driver x-ray-phantom.
Examples in the pages above, but here's how to do dynamic scraping:
var phantom = require('x-ray-phantom');
var Xray = require('x-ray');
var x = Xray()
.driver(phantom());
x('http://google.com', 'title')(function(err, str) {
if (err) return done(err);
assert.equal('Google', str);
done();
})
Answering this as a canonical, an alternative to Puppeteer for scraping dynamic sites which is also well-supported as of 2023 is Playwright. Here's a simple example:
const playwright = require("playwright"); // ^1.28.1
let browser;
(async () => {
browser = await playwright.chromium.launch();
const page = await browser.newPage();
await page.goto("https://example.com");
const text = await page.locator('h1:text("Example")').textContent();
console.log(text); // => Example Domain
})()
.catch(err => console.error(err))
.finally(() => browser?.close());
Easiest and reliable solution is to use puppeteer. As mentioned in https://pusher.com/tutorials/web-scraper-node which is suitable for both static + dynamic scraping.
Only change the timeout in Browser.js, TimeoutSettings.js, Launcher.js 300000 to 3000000
I have followed Philippe Leefsma's tutorial on how to implement the markup tool, but without any luck. Link here: http://adndevblog.typepad.com/cloud_and_mobile/2016/02/playing-with-the-new-view-data-markup-api.html
and here: https://developer.api.autodesk.com/viewingservice/v1/viewers/docs/tutorial-feature_markup.html
I get errors that I need to include requireJS, but I don't want to use it. So instead I used this script in my html file:
<script src="https://autodeskviewer.com/viewers/2.2/extensions/MarkupsCore.js">
I don't know if this is the right way to go? I get no errors in the console, but the markup button doesn't show up in the dockingpanel.
This is my code for loading the extension in the viewer:
viewerApp = null;
function initializeViewer(containerId, urn, params) {
function getToken(url) {
return new Promise(function (resolve, reject) {
$.get(url, function (response) {
resolve(response.access_token);
});
});
}
var initOptions = {
documentId: 'urn:' + urn,
env: 'AutodeskProduction',
getAccessToken: function (onGetAccessToken) {
getToken(params.gettokenurl).then(function (val) {
var accessToken = val;
var expireTimeSeconds = 60 * 30;
onGetAccessToken(accessToken, expireTimeSeconds);
});
}
}
function onDocumentLoaded(doc) {
var rootItem = doc.getRootItem();
// Grab all 3D items
var geometryItems3d =
Autodesk.Viewing.Document.getSubItemsWithProperties(
rootItem, { 'type': 'geometry', 'role': '3d' }, true);
// Grab all 2D items
var geometryItems2d =
Autodesk.Viewing.Document.getSubItemsWithProperties(
rootItem, { 'type': 'geometry', 'role': '2d' }, true);
// Pick the first 3D item otherwise first 2D item
var selectedItem = (geometryItems3d.length ?
geometryItems3d[0] :
geometryItems2d[0]);
var domContainer = document.getElementById('viewerContainer');
var config = { extensions: ["Autodesk.Viewing.MarkupsCore"] };
// GUI Version: viewer with controls
var viewer = new Autodesk.Viewing.Private.GuiViewer3D(domContainer, config);
viewer.loadExtension("Autodesk.Viewing.MarkupsCore");
viewer.initialize();
viewer.loadModel(doc.getViewablePath(selectedItem));
var extension = viewer.getExtension("Autodesk.Viewing.MarkupsCore");
viewerApp = viewer;
}
function onEnvInitialized() {
Autodesk.Viewing.Document.load(
initOptions.documentId,
function (doc) {
onDocumentLoaded(doc);
},
function (errCode) {
onLoadError(errCode);
})
}
function onLoadError(errCode) {
console.log('Error loading document: ' + errCode);
}
Autodesk.Viewing.Initializer(
initOptions,
function () {
onEnvInitialized()
})
}
Any help is highly appreciated!
Unfortunately there has been a few changes to the API since I wrote that blog post. The MarkupCore.js is now included in the viewer3D.js source, so you don't need to reference any extra file or use requireJS if you use the latest version of the viewer API.
Keep in mind that this is an API-only feature, so even after loading the markup extension, you won't get any UI out of the box. You have to implemented it yourself, for example create a dialog with buttons that may eventually create markups by calling the API.
Some of the code from my blog post may still be valid and give you an idea about what you need to do.
Hope that helps.
I'm leveraging Google Chrome's omnibox API in my extension.
Current users, including myself, have noticed that the omnibox ceases responding entirely after an undetermined state change or a period of time lapsing. Typing the word to trigger entering into "omnibox" stops having any effect and the URL bar does not shift into omnibox mode.
Restarting Google Chrome does not fix the issue, but restarting my plugin by unchecking and then re-checking the 'enabled' checkbox on chrome://extensions does resolve the issue.
Does anyone have any suggestions on what to investigate? Below is the code used. It is only loaded once through my permanently persisted background page:
// Displays streamus search suggestions and allows instant playing in the stream
define([
'background/collection/streamItems',
'background/model/video',
'common/model/youTubeV2API',
'common/model/utility'
], function (StreamItems, Video, YouTubeV2API, Utility) {
'use strict';
console.log("Omnibox LOADED", chrome.omnibox);
var Omnibox = Backbone.Model.extend({
defaults: function () {
return {
suggestedVideos: [],
searchJqXhr: null
};
},
initialize: function () {
console.log("Omnibox INITIALIZED");
var self = this;
chrome.omnibox.setDefaultSuggestion({
// TODO: i18n
description: 'Press enter to play.'
});
// User has started a keyword input session by typing the extension's keyword. This is guaranteed to be sent exactly once per input session, and before any onInputChanged events.
chrome.omnibox.onInputChanged.addListener(function (text, suggest) {
// Clear suggested videos
self.get('suggestedVideos').length = 0;
var trimmedSearchText = $.trim(text);
// Clear suggestions if there is no text.
if (trimmedSearchText === '') {
suggest();
} else {
// Do not display results if searchText was modified while searching, abort old request.
var previousSearchJqXhr = self.get('searchJqXhr');
if (previousSearchJqXhr) {
previousSearchJqXhr.abort();
self.set('searchJqXhr', null);
}
var searchJqXhr = YouTubeV2API.search({
text: trimmedSearchText,
// Omnibox can only show 6 results
maxResults: 6,
success: function(videoInformationList) {
self.set('searchJqXhr', null);
var suggestions = self.buildSuggestions(videoInformationList, trimmedSearchText);
suggest(suggestions);
}
});
self.set('searchJqXhr', searchJqXhr);
}
});
chrome.omnibox.onInputEntered.addListener(function (text) {
// Find the cached video data by url
var pickedVideo = _.find(self.get('suggestedVideos'), function(suggestedVideo) {
return suggestedVideo.get('url') === text;
});
// If the user doesn't make a selection (commonly when typing and then just hitting enter on their query)
// take the best suggestion related to their text.
if (pickedVideo === undefined) {
pickedVideo = self.get('suggestedVideos')[0];
}
StreamItems.addByVideo(pickedVideo, true);
});
},
buildSuggestions: function(videoInformationList, text) {
var self = this;
var suggestions = _.map(videoInformationList, function (videoInformation) {
var video = new Video({
videoInformation: videoInformation
});
self.get('suggestedVideos').push(video);
var safeTitle = _.escape(video.get('title'));
var textStyleRegExp = new RegExp(Utility.escapeRegExp(text), "i");
var styledTitle = safeTitle.replace(textStyleRegExp, '<match>$&</match>');
var description = '<dim>' + video.get('prettyDuration') + "</dim> " + styledTitle;
return {
content: video.get('url'),
description: description
};
});
return suggestions;
}
});
return new Omnibox();
});
As far as I'm aware the code itself is fine and wouldn't have any effect on whether I see omnibox or not.
You can find full source code here: https://github.com/MeoMix/StreamusChromeExtension/blob/master/src/js/background/model/omnibox.js
I'm testing WebRTC procedure step by step for my sake.
I wrote some testing site for server-less WebRTC.
http://webrtcdevelop.appspot.com/
In fact, STUN server by google is used, but no signalling server deployed.
Session Description Protocol (SDP) is exchanged manually by hand that is CopyPaste between browser windows.
So far, here is the result I've got with the code:
'use strict';
var peerCon;
var ch;
$(document)
.ready(function()
{
init();
$('#remotebtn2')
.attr("disabled", "");
$('#localbtn')
.click(function()
{
offerCreate();
$('#localbtn')
.attr("disabled", "");
$('#remotebtn')
.attr("disabled", "");
$('#remotebtn2')
.removeAttr("disabled");
});
$('#remotebtn')
.click(function()
{
answerCreate(
new RTCSessionDescription(JSON.parse($('#remote')
.val())));
$('#localbtn')
.attr("disabled", "");
$('#remotebtn')
.attr("disabled", "");
$('#remotebtn')
.attr("disabled", "");
});
$('#remotebtn2')
.click(function()
{
answerGet(
new RTCSessionDescription(JSON.parse($('#remote')
.val())));
$('#remotebtn2')
.attr("disabled", "");
});
$('#msgbtn')
.click(function()
{
msgSend($('#msg')
.val());
});
});
var init = function()
{
//offer------
peerCon =
new RTCPeerConnection(
{
"iceServers": [
{
"url": "stun:stun.l.google.com:19302"
}]
},
{
"optional": []
});
var localDescriptionOut = function()
{
console.log(JSON.stringify(peerCon.localDescription));
$('#local')
.text(JSON.stringify(peerCon.localDescription));
};
peerCon.onicecandidate = function(e)
{
console.log(e);
if (e.candidate === null)
{
console.log('candidate empty!');
localDescriptionOut();
}
};
ch = peerCon.createDataChannel(
'ch1',
{
reliable: true
});
ch.onopen = function()
{
dlog('ch.onopen');
};
ch.onmessage = function(e)
{
dlog(e.data);
};
ch.onclose = function(e)
{
dlog('closed');
};
ch.onerror = function(e)
{
dlog('error');
};
};
var msgSend = function(msg)
{
ch.send(msg);
}
var offerCreate = function()
{
peerCon
.createOffer(function(description)
{
peerCon
.setLocalDescription(description, function()
{
//wait for complete of peerCon.onicecandidate
}, error);
}, error);
};
var answerCreate = function(descreption)
{
peerCon
.setRemoteDescription(descreption, function()
{
peerCon
.createAnswer(
function(description)
{
peerCon
.setLocalDescription(description, function()
{
//wait for complete of peerCon.onicecandidate
}, error);
}, error);
}, error);
};
var answerGet = function(description)
{
peerCon.setRemoteDescription(description, function()
{ //
console.log(JSON.stringify(description));
dlog('local-remote-setDescriptions complete!');
}, error);
};
var error = function(e)
{
console.log(e);
};
var dlog = function(msg)
{
var content = $('#onmsg')
.html();
$('#onmsg')
.html(content + msg + '<br>');
}
Firefox(26.0):
RtpDataChannels
onopen event is fired successfully, but send fails.
Chrome(31.0):
RtpDataChannels
onopen event is fired successfully, and send also succeeded.
A SDP object by Chrome is as follows:
{"sdp":".................. cname:L5dftYw3P3clhLve
\r\
na=ssrc:2410443476 msid:ch1 ch1
\r\
na=ssrc:2410443476 mslabel:ch1
\r\
na=ssrc:2410443476 label:ch1
\r\n","type":"offer"}
where the ch1 information defined in the code;
ch = peerCon.createDataChannel(
'ch1',
{
reliable: false
});
is bundled properly.
However, a SDP object (local description) by Firefox does not contain DataChannel at all, and moreover, the SDP is much shorter than Chrome, and less information bundled.
What do I miss?
Probably, I guess the reason that send fails on DataChannel is due to this lack of information in the SDP object by firefox.
How could I fix this?
I investigated sources of various working libraries, such as peerJS, easyRTC, simpleWebRTC, but cannot figure out the reason.
Any suggestion and recommendation to read is appreciated.
[not an answer, yet]
I leave this here just trying to help you. I am not much of a WebRTC developer. But, curious i am, this quite new and verry interresting for me.
Have you seen this ?
DataChannels
Supported in Firefox today, you can use DataChannels to send peer-to-peer
information during an audio/video call. There is
currently a bug that requires developers to set up some sort of
audio/video stream (even a “fake” one) in order to initiate a
DataChannel, but we will soon be fixing that.
Also, i found this bug hook, witch seems to be related.
One last point, your version of adapter.js is different from the one served on code.google. And .. alot. the webrtcDetectedVersion part is missing in yours.
https://code.google.com/p/webrtc/source/browse/stable/samples/js/base/adapter.js
Try that, come back to me with good newz. ?
After last updating, i have this line in console after clicking 'get answer'
Object { name="INVALID_STATE", message="Cannot set remote offer in
state HAVE_LOCAL_OFFER", exposedProps={...}, more...}
but this might be useless info ence i copy pasted the same browser offre to answer.
.. witch made me notice you are using jQuery v1.7.1 jquery.com.
Try updating jQuery (before i kill a kitten), and in the meantime, try make sure you use all updated versions of scripts.
Woups, after fast reading this : https://developer.mozilla.org/en-US/docs/Web/Guide/API/WebRTC/WebRTC_basics then comparing your javascripts, i see no SHIM.
Shims
As you can imagine, with such an early API, you must use the browser
prefixes and shim it to a common variable.
> var PeerConnection = window.mozRTCPeerConnection ||
> window.webkitRTCPeerConnection; var IceCandidate =
> window.mozRTCIceCandidate || window.RTCIceCandidate; var
> SessionDescription = window.mozRTCSessionDescription ||
> window.RTCSessionDescription; navigator.getUserMedia =
> navigator.getUserMedia || navigator.mozGetUserMedia ||
> navigator.webkitGetUserMedia;
I just downloaed and installed phantomjs on my machine. I copy and pasted the following script into a file called hello.js:
var page = require('webpage').create();
var url = 'https://www.google.com'
page.onLoadStarted = function () {
console.log('Start loading...');
};
page.onLoadFinished = function (status) {
console.log('Loading finished.');
phantom.exit();
};
page.open(url);
I'd like to print the complete html source (in this case from the google page) to a file or to the console. How do I do this?
Spent some time to read the documentation, it should be obvious afterwards.
var page = require('webpage').create();
page.open('http://google.com', function () {
console.log(page.content);
phantom.exit();
});