I am running a simple node script which starts chromedriver pointed at my website, scrolls to the bottom of the page, and writes the trace to trace.json.
This file is around 30MB.
I can't seem to load this file in chrome://tracing/, which is what I assume I would do in order to view the profile data.
What are my options for making sense of my trace.json file?
Here is my node script, in case that helps clarify what I am up to:
'use strict';
var fs = require('fs');
var wd = require('wd');
var b = wd.promiseRemote('http://localhost:9515');

b.init({
  browserName: 'chrome',
  chromeOptions: {
    perfLoggingPrefs: {
      'traceCategories': 'toplevel,disabled-by-default-devtools.timeline.frame,blink.console,disabled-by-default-devtools.timeline,benchmark'
    },
    args: ['--enable-gpu-benchmarking', '--enable-threaded-compositing']
  },
  loggingPrefs: {
    performance: 'ALL'
  }
}).then(function () {
  return b.get('http://www.example.com');
}).then(function () {
  // We only want to measure interaction, so getting a log once here
  // flushes any previous tracing logs we have.
  return b.log('performance');
}).then(function () {
  // Smooth scroll to bottom.
  return b.execute(`
    var height = Math.max(document.documentElement.scrollHeight, document.body.scrollHeight, document.documentElement.clientHeight);
    chrome.gpuBenchmarking.smoothScrollBy(height, function () {});
  `);
}).then(function () {
  // Wait for the above action to complete.
  return b.sleep(5000);
}).then(function () {
  // Get all the trace logs since last time log('performance') was called.
  return b.log('performance');
}).then(function (data) {
  // Write the file to disk.
  return fs.writeFileSync('trace.json', JSON.stringify(data.map(function (s) {
    return JSON.parse(s.message); // This is needed since Selenium outputs logs as strings.
  })));
}).fin(function () {
  return b.quit();
}).done();
Your script doesn't generate the correct format. The required data for each entry is located in message.message.params.
To generate a trace that can be loaded in chrome://tracing:
var fs = require('fs');
var webdriver = require('selenium-webdriver');

var driver = new webdriver.Builder()
  .withCapabilities({
    browserName: 'chrome',
    loggingPrefs: { performance: 'ALL' },
    chromeOptions: {
      args: ['--enable-gpu-benchmarking', '--enable-threaded-compositing'],
      perfLoggingPrefs: {
        'traceCategories': 'toplevel,disabled-by-default-devtools.timeline.frame,blink.console,disabled-by-default-devtools.timeline,benchmark'
      }
    }
  }).build();

driver.get('https://www.google.com/ncr');
driver.sleep(1000);

// generate a trace file loadable in chrome://tracing
driver.manage().logs().get('performance').then(function (data) {
  fs.writeFileSync('trace.json', JSON.stringify(data.map(function (d) {
    return JSON.parse(d['message'])['message']['params'];
  })));
});

driver.quit();
The same script in Python:
import json, time
from selenium import webdriver

driver = webdriver.Chrome(desired_capabilities = {
    'loggingPrefs': { 'performance': 'ALL' },
    'chromeOptions': {
        "args": ['--enable-gpu-benchmarking', '--enable-threaded-compositing'],
        "perfLoggingPrefs": {
            "traceCategories": "toplevel,disabled-by-default-devtools.timeline.frame,blink.console,disabled-by-default-devtools.timeline,benchmark"
        }
    }
})

driver.get('https://stackoverflow.com')
time.sleep(1)

# generate a trace file loadable in chrome://tracing
with open(r"trace.json", 'w') as f:
    f.write(json.dumps([json.loads(d['message'])['message']['params'] for d in driver.get_log('performance')]))

driver.quit()
In case you don't know it, the recommended library for parsing these traces is https://github.com/ChromeDevTools/devtools-frontend
Also, the recommended trace categories are __metadata,benchmark,devtools.timeline,rail,toplevel,disabled-by-default-v8.cpu_profiler,disabled-by-default-devtools.timeline,disabled-by-default-devtools.timeline.frame,blink.user_timing,v8.execute,disabled-by-default-devtools.screenshot
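For example, swapping them into the perfLoggingPrefs of the scripts above is a drop-in change; only the category string differs (shown here split across lines purely for readability):
perfLoggingPrefs: {
  'traceCategories': '__metadata,benchmark,devtools.timeline,rail,toplevel,' +
    'disabled-by-default-v8.cpu_profiler,disabled-by-default-devtools.timeline,' +
    'disabled-by-default-devtools.timeline.frame,blink.user_timing,v8.execute,' +
    'disabled-by-default-devtools.screenshot'
}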
It's a very old question, but I hope this helps others who are new to it.
Hello, beautiful people on the internet. I am new to Chrome extensions, though not new to writing code. I have implemented webpack to use external packages. One major one in my application is the npm package "mark.js".
My application works like this: I want some specific words to be highlighted in the active web page using this package. I have written the code to achieve this functionality, but the problem is with loading the script in a web page. I have tried different approaches to loading the script, but none of them work. The new MV3 version has some strict rules.
I want to achieve anything similar to loading a script in the active web page. Please help.
btn.addEventListener("click", async () => {
console.log("BUTTON IS PRESSED!!");
try {
await chrome.tabs.query(
{ active: true, currentWindow: true },
async function (tabs) {
chrome.scripting.executeScript({
target: { tabId: tabs[0].id },
func: highlightText,
args: [arr],
});
}
);
} catch (e) {
console.log("ERROR AT CHROME TAB QUERY : ", e);
}
});
async function highlightText(arr) {
  console.log(typeof Mark);
  try {
    var instance2 = new Mark(document.querySelector("body"));
    // instance2.mark("is");
    var success = [];
    // const instance2 = new Mark(document.querySelector("body"));
    await Promise.all(
      arr.map(async function (obj) {
        console.log("OBJECT TEXT : ", obj.text);
        instance2.mark(obj.text, {
          element: "span",
          each: function (ele) {
            console.log("STYLING : ");
            ele.setAttribute("style", `background-color: ${obj.color};`);
            if (obj.title && obj.title != "") {
              ele.setAttribute("title", obj.title);
            }
            ele.innerHTML = obj.text;
            success.push({
              status: "Success",
              text: obj.text,
            });
          },
        });
      })
    );
    console.log("SUCCESS : ", success);
  } catch (error) {
    console.log(error);
  }
}
There's no need to use chrome.runtime.getURL. Since you use executeScript to run your code, all you need is to inject mark.js before injecting the function.
Also, don't load popup.js in content_scripts; it's not a content script (those run in web pages), it's a script for your extension page. Actually, you don't need content_scripts at all.
btn.addEventListener('click', async () => {
  const [tab] = await chrome.tabs.query({ active: true, currentWindow: true });
  const target = { tabId: tab.id };
  // The arrow function must be async, since it awaits executeScript.
  const exec = async v => (await chrome.scripting.executeScript({ target, ...v }))[0].result;
  // Inject mark.js only if it isn't already present, then run the highlighter.
  if (!await exec({ func: () => !!window.Mark })) {
    await exec({ files: ['mark.js.min'] });
  }
  await exec({ func: highlightText, args: [arr] });
});
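For this to work, the manifest also needs the scripting permission plus host access; a minimal MV3 sketch (activeTab is one option for host access, host_permissions would be another, and the popup file name is illustrative):
{
  "manifest_version": 3,
  "permissions": ["scripting", "activeTab"],
  "action": { "default_popup": "popup.html" }
}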
For V3 I assume you will want to use Content Scripts in your manifest to inject the JavaScript into every web page it matches. I recently open-sourced TorpedoRead and had to do both V2 and V3; I recommend checking the repo, as it sounds like I did something similar to you (Firefox is V2, Chrome is V3).
The code below needs to be added to your manifest.json, and it will execute on every page based on the matches property. You can read more about content scripts here: https://developer.chrome.com/docs/extensions/mv3/content_scripts/
"content_scripts": [
{
"matches": ["<all_urls>"],
"js": ["yourscript.js"]
}
],
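If you go this route with mark.js, list it before your own script, since files in the js array are injected in order and yourscript.js needs Mark to be defined first (file names here are illustrative):
"content_scripts": [
  {
    "matches": ["<all_urls>"],
    "js": ["mark.min.js", "yourscript.js"]
  }
],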
const fs = require('fs');
const gulp = require('gulp');
const inquirer = require('inquirer');
const { Readable } = require('stream');

gulp.task('default', function (done) {
  inquirer.prompt([{
    type: `input`,
    message: `Enter the path`,
    default: `./admin/admin.json`,
    name: `path`
  }]).then(function (answers) {
    console.log(answers.path);
    console.log('answers');
    mydefaultTaskTwo(null, answers.path).pipe(pipedFunction());
    done();
  })
});

function mydefaultTaskTwo(cb, path) {
  let data = '';
  try {
    data = fs.readFileSync(path, 'utf-8');
  } catch (e) {
    console.log(`Error: ${e}`);
  }
  return data;
}

function pipedFunction() {
  let object = JSON.parse(data);
  object['main'] = 'admin';
  data = JSON.stringify(object);
  const readable = Readable.from(data)
  return readable;
}
I understand that src returns a stream and pipe takes that stream and returns a stream, but how do you feed the stream into pipedFunction when it is called inside pipe? I am unsure how this works. I get the following error:
ReferenceError: data is not defined
Is there something I am misunderstanding about gulp scripts?
Basically, you define data as a local variable inside mydefaultTaskTwo and then try to reach it from a different scope, where it is undefined. You need to make use of the fact that data is returned, and pass it into pipedFunction explicitly, like:
var data = mydefaultTaskTwo(null, answers.path);
pipedFunction(data); // and declare it as function pipedFunction(data) { ... }
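Putting it together, a minimal sketch of the corrected task, with data threaded through explicitly (process.stdout is just a placeholder for whatever writable stream you actually pipe into):
const { Readable } = require('stream');

function pipedFunction(data) {
  const object = JSON.parse(data);
  object['main'] = 'admin';
  return Readable.from(JSON.stringify(object));
}

gulp.task('default', function (done) {
  inquirer.prompt([{
    type: 'input',
    message: 'Enter the path',
    default: './admin/admin.json',
    name: 'path'
  }]).then(function (answers) {
    const data = mydefaultTaskTwo(null, answers.path); // returns the file contents as a string
    pipedFunction(data).pipe(process.stdout); // pipe the readable stream to any writable destination
    done();
  });
});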
It is perfectly valid to import from a URL inside an ES6 module and as such I've been using this technique to reuse modules between microservices that sit on different hosts/ports:
import { authInstance } from "http://auth-microservice/js/authInstance.js"
I'm approaching a release cycle and have started down my usual path of bundling to IIFEs using Rollup. Rollup doesn't appear to support ES6 module imports from URLs; I think it should, as this is allowed in the spec :(
module-name
The module to import from. This is often a relative or absolute path name to the .js file containing the module. Certain bundlers may permit or require the use of the extension; check your environment. Only single quotes and double quotes Strings are allowed. (https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Statements/import)
I've dug through the interwebs for an hour now and have come up with nothing. Has anybody seen a resolver similar to rollup-plugin-node-resolve for resolving modules from URLs?
I had to move on from this quickly so ended up just writing a skeleton of a rollup plugin. I still feel that resolving absolute paths should be a core feature of rollup.
Updated snippet
We have been using this to transpile production code for several of our apps for a considerable amount of time now.
const fs = require('fs'),
  path = require('path'),
  axios = require("axios")

const createDir = path => !fs.existsSync(path) && fs.mkdirSync(path)

const mirrorDirectoryPaths = async ({ cacheLocation, url }) => {
  createDir(cacheLocation)
  const dirs = [], scriptPath = url.replace(/:\/\/|:/g, "-")
  let currentDir = path.dirname(scriptPath)
  while (currentDir !== '.') {
    dirs.unshift(currentDir)
    currentDir = path.dirname(currentDir)
  }
  dirs.forEach(d => createDir(`${cacheLocation}${d}`))
  return `${cacheLocation}${scriptPath}`
}

const cacheIndex = {}
const writeToDiskCache = async ({ cacheLocation, url }) => {
  // Write a file to the local disk cache for rollup to pick up.
  // If the file already exists, use it instead of writing a new one.
  const cached = cacheIndex[url]
  if (cached) return cached
  const cacheFile = await mirrorDirectoryPaths({ cacheLocation, url }),
    data = (await axios.get(url).catch((e) => { console.log(url, e) })).data
  fs.writeFileSync(cacheFile, data)
  cacheIndex[url] = cacheFile
  return cacheFile
}

const urlPlugin = (options = {}) => {
  return {
    async resolveId(importee, importer) {
      // We are importing from a URL.
      if (/^https?:\/\//.test(importee)) {
        return await writeToDiskCache({ cacheLocation: options.cacheLocation, url: importee })
      }
      // We are importing from a file within the cacheLocation (originally from a URL)
      // and need to continue the cache import chain.
      if (importer && importer.startsWith(options.cacheLocation) && /^\.\.?\//.test(importee)) {
        const importerUrl = Object.keys(cacheIndex).find(key => cacheIndex[key] === importer),
          importerPath = path.dirname(importerUrl),
          importeeUrl = path.normalize(`${importerPath}/${importee}`).replace(":\\", "://").replace(/\\/g, "/")
        return await writeToDiskCache({ cacheLocation: options.cacheLocation, url: importeeUrl })
      }
    }
  }
}
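To wire it up, a minimal rollup.config.js sketch (assuming the plugin above is exported from ./rollup-plugin-url.js; the input/output paths and cacheLocation values are illustrative):
import urlPlugin from './rollup-plugin-url.js';

export default {
  input: 'src/index.js',
  output: { file: 'dist/bundle.js', format: 'iife' },
  plugins: [urlPlugin({ cacheLocation: './.url-cache' })]
};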
This plugin together with the following config works for me:
https://github.com/mjackson/rollup-plugin-url-resolve
import typescript from "@rollup/plugin-typescript";
import urlResolve from "rollup-plugin-url-resolve";

export default {
  output: {
    format: "esm",
  },
  plugins: [
    typescript({ lib: ["es5", "es6", "dom"], target: "es5" }),
    urlResolve(),
  ],
};
You can obviously remove the TypeScript plugin if you don't need it.
Being new to Elasticsearch, I am exploring it by integrating it with Node and trying to execute the following online Git example on Windows:
https://github.com/sitepoint-editors/node-elasticsearch-tutorial
While trying to import the data of 1000 items from data.json, the execution of 'node index.js' is failing with an error. By enabling the trace, I now see the following as the root cause, coming from the bulk function:
"error": "Content-Type header [application/x-ldjson] is not supported",
"status": 406
I see a changelog at https://www.elastic.co/guide/en/elasticsearch/client/javascript-api/current/changelog.html which says the following:
13.0.0 (Apr 24 2017): bulk and other APIs that send line-delimited JSON bodies now use the Content-Type: application/x-ndjson header (#507)
Any idea how to resolve this content type issue in index.js?
index.js
(function () {
  'use strict';

  const fs = require('fs');
  const elasticsearch = require('elasticsearch');
  const esClient = new elasticsearch.Client({
    host: 'localhost:9200',
    log: 'error'
  });

  const bulkIndex = function bulkIndex(index, type, data) {
    let bulkBody = [];
    data.forEach(item => {
      bulkBody.push({
        index: {
          _index: index,
          _type: type,
          _id: item.id
        }
      });
      bulkBody.push(item);
    });
    esClient.bulk({body: bulkBody})
      .then(response => {
        console.log(`Inside bulk3...`);
        let errorCount = 0;
        response.items.forEach(item => {
          if (item.index && item.index.error) {
            console.log(++errorCount, item.index.error);
          }
        });
        console.log(`Successfully indexed ${data.length - errorCount} out of ${data.length} items`);
      })
      .catch(console.error);
  };

  // only for testing purposes
  // all calls should be initiated through the module
  const test = function test() {
    const articlesRaw = fs.readFileSync('data.json');
    const articles = JSON.parse(articlesRaw);
    console.log(`${articles.length} items parsed from data file`);
    bulkIndex('library', 'article', articles);
  };

  test();

  module.exports = {
    bulkIndex
  };
}());
My local Windows environment:
Java version 1.8.0_121
Elasticsearch version 6.1.1
Node version v8.9.4
npm version 5.6.0
The bulk function doesn't return a promise. It accepts a callback function as a parameter.
esClient.bulk(
  {body: bulkBody},
  function (err, response) {
    if (err) { console.error(err); return; }
    console.log(`Inside bulk3...`);
    let errorCount = 0;
    response.items.forEach(item => {
      if (item.index && item.index.error) {
        console.log(++errorCount, item.index.error);
      }
    });
    console.log(`Successfully indexed ${data.length - errorCount} out of ${data.length} items`);
  }
)
or use promisify to convert a function accepting an (err, value) => ... style callback to a function that returns a promise.
const util = require('util')
// bind so that bulk keeps esClient as its this context
const esClientBulk = util.promisify(esClient.bulk.bind(esClient))
esClientBulk({body: bulkBody})
  .then(...)
  .catch(...)
EDIT: Just found out that elasticsearch-js supports both callbacks and promises. So this should not be an issue.
By looking at the package.json of the project that you've linked, it uses elasticsearch-js version ^11.0.1, which is an old version that sends bulk-upload requests with the application/x-ldjson header, which is not supported by newer Elasticsearch versions. So upgrading elasticsearch-js to a newer version (the current latest is 14.0.0) should fix it.
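In the project's package.json, that's a one-line change followed by a reinstall (version current as of the time of writing):
"dependencies": {
  "elasticsearch": "^14.0.0"
}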
Is there any way to run Headless Chrome/Chromium in a Google Cloud Function? I understand I can include and run statically compiled binaries in GCF. Can I get a statically compiled version of Chrome that would work for this?
The Node.js 8 runtime for Google Cloud Functions now includes all the necessary OS packages to run Headless Chrome.
Here is a code sample of an HTTP function that returns screenshots:
Main index.js file:
const puppeteer = require('puppeteer');

exports.screenshot = async (req, res) => {
  const url = req.query.url;

  if (!url) {
    return res.send('Please provide URL as GET parameter, for example: ?url=https://example.com');
  }

  const browser = await puppeteer.launch({
    args: ['--no-sandbox']
  });
  const page = await browser.newPage();
  await page.goto(url);
  const imageBuffer = await page.screenshot();
  await browser.close();

  res.set('Content-Type', 'image/png');
  res.send(imageBuffer);
};
and package.json
{
  "name": "screenshot",
  "version": "0.0.1",
  "dependencies": {
    "puppeteer": "^1.6.2"
  }
}
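Once deployed on the Node.js 8 runtime, the function is reachable as a plain HTTPS endpoint along the lines of https://REGION-PROJECT.cloudfunctions.net/screenshot?url=https://example.com (region and project ID are placeholders for your own).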
I've just deployed a GCF function running headless Chrome. A couple of takeaways:
you have to statically compile Chromium and NSS on Debian 8
you have to patch environment variables to point to NSS before launching Chromium
performance is much worse than what you'd get on AWS Lambda (3+ seconds)
For 1, you should be able to find plenty of instructions online.
For 2, the code that I'm using is the following:
static executablePath() {
  let bin = path.join(__dirname, '..', 'bin', 'chromium');
  let nss = path.join(__dirname, '..', 'bin', 'nss', 'Linux3.16_x86_64_cc_glibc_PTH_64_OPT.OBJ');

  if (process.env.PATH === undefined) {
    process.env.PATH = path.join(nss, 'bin');
  } else if (process.env.PATH.indexOf(nss) === -1) {
    process.env.PATH = [path.join(nss, 'bin'), process.env.PATH].join(':');
  }

  if (process.env.LD_LIBRARY_PATH === undefined) {
    process.env.LD_LIBRARY_PATH = path.join(nss, 'lib');
  } else if (process.env.LD_LIBRARY_PATH.indexOf(nss) === -1) {
    process.env.LD_LIBRARY_PATH = [path.join(nss, 'lib'), process.env.LD_LIBRARY_PATH].join(':');
  }

  if (fs.existsSync('/tmp/chromium') === true) {
    return '/tmp/chromium';
  }

  return new Promise(
    (resolve, reject) => {
      try {
        fs.chmod(bin, '0755', () => {
          fs.symlinkSync(bin, '/tmp/chromium');
          return resolve('/tmp/chromium');
        });
      } catch (error) {
        return reject(error);
      }
    }
  );
}
You also need to pass a few required arguments when starting Chrome (see the launch sketch after this list), namely:
--disable-dev-shm-usage
--disable-setuid-sandbox
--no-first-run
--no-sandbox
--no-zygote
--single-process
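For illustration, a minimal launch sketch wiring these flags in (assuming a puppeteer-style API and the executablePath() helper above, which may return a promise):
const puppeteer = require('puppeteer');

async function launchChrome() {
  return puppeteer.launch({
    executablePath: await executablePath(), // helper defined above
    args: [
      '--disable-dev-shm-usage',
      '--disable-setuid-sandbox',
      '--no-first-run',
      '--no-sandbox',
      '--no-zygote',
      '--single-process'
    ]
  });
}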
I hope this helps.
As mentioned in the comment, work is being done on a possible solution for running a headless browser in a cloud function. A directly applicable discussion, "headless chrome & aws lambda", can be read on Google Groups.
The question at hand was: can you run headless Chrome or Chromium in Firebase Cloud Functions? The answer is NO, since the Node.js project will not have access to any Chrome/Chromium executables and will therefore fail (TRUST ME - I've tried!).
A better solution is to use the phantom npm package, which uses PhantomJS under the hood:
https://www.npmjs.com/package/phantom
Docs and info can be found here:
http://amirraminfar.com/phantomjs-node/#/
or
https://github.com/amir20/phantomjs-node
The site I was trying to crawl had implemented anti-scraping measures; the trick is to wait for the page to load by searching for an expected string or regex match. (If you need a regex of any complexity made for you, get in touch at https://AppLogics.uk/ - starting at £5 (GBP).)
Here is a TypeScript snippet to make the HTTP or HTTPS call:
const phantom = require('phantom');
const instance: any = await phantom.create(['--ignore-ssl-errors=yes', '--load-images=no']);
const page: any = await instance.createPage();
const status = await page.open('https://somewebsite.co.uk/');
const content = await page.property('content');
The same again in JavaScript:
const phantom = require('phantom');
const instance = yield phantom.create(['--ignore-ssl-errors=yes', '--load-images=no']);
const page = yield instance.createPage();
const status = yield page.open('https://somewebsite.co.uk/');
const content = yield page.property('content');
That's the easy bit! If it's a static page, you're pretty much done, and you can parse the HTML with something like the cheerio npm package: https://github.com/cheeriojs/cheerio - an implementation of core jQuery designed for servers!
However, if it is a dynamically loading page (i.e. lazy loading, or even anti-scraping methods), you will need to wait for the page to update by looping, calling the page.property('content') method, and running a text search or regex to see if your page has finished loading.
I have created a generic asynchronous function that returns the page content (as a string) on success and throws an exception on failure or timeout. It takes as parameters the page, text (a string to search for that indicates success), error (a string that indicates failure, or null to not check for an error), and timeout (a number, self-explanatory):
TypeScript:
// isNullOrUndefined comes from Node's util module (or substitute a trivial helper).
async function waitForPageToLoadStr(page: any, text: string, error: string, timeout: number): Promise<string> {
  const maxTime = timeout ? (new Date()).getTime() + timeout : null;
  let html: string = '';
  html = await page.property('content');

  async function loop(): Promise<string> {
    async function checkSuccess(): Promise<boolean> {
      html = await page.property('content');
      if (!isNullOrUndefined(error) && html.includes(error)) {
        throw new Error(`Error string found: ${ error }`);
      }
      if (maxTime && (new Date()).getTime() >= maxTime) {
        throw new Error(`Timed out waiting for string: ${ text }`);
      }
      return html.includes(text);
    }

    if (await checkSuccess()) {
      return html;
    } else {
      return loop();
    }
  }

  return await loop();
}
JavaScript:
function waitForPageToLoadStr(page, text, error, timeout) {
  return __awaiter(this, void 0, void 0, function* () {
    const maxTime = timeout ? (new Date()).getTime() + timeout : null;
    let html = '';
    html = yield page.property('content');

    function loop() {
      return __awaiter(this, void 0, void 0, function* () {
        function checkSuccess() {
          return __awaiter(this, void 0, void 0, function* () {
            html = yield page.property('content');
            if (!isNullOrUndefined(error) && html.includes(error)) {
              throw new Error(`Error string found: ${error}`);
            }
            if (maxTime && (new Date()).getTime() >= maxTime) {
              throw new Error(`Timed out waiting for string: ${text}`);
            }
            return html.includes(text);
          });
        }

        if (yield checkSuccess()) {
          return html;
        } else {
          return loop();
        }
      });
    }

    return yield loop();
  });
}
I have personally used this function like this:
TypeScript:
try {
  const phantom = require('phantom');
  const instance: any = await phantom.create(['--ignore-ssl-errors=yes', '--load-images=no']);
  const page: any = await instance.createPage();
  const status = await page.open('https://somewebsite.co.uk/');
  await waitForPageToLoadStr(page, '<div>Welcome to somewebsite</div>', '<h1>Website under maintenance, try again later</h1>', 1000);
} catch (error) {
  console.error(error);
}
JavaScript:
try {
  const phantom = require('phantom');
  const instance = yield phantom.create(['--ignore-ssl-errors=yes', '--load-images=no']);
  const page = yield instance.createPage();
  yield page.open('https://vehicleenquiry.service.gov.uk/');
  yield waitForPageToLoadStr(page, '<div>Welcome to somewebsite</div>', '<h1>Website under maintenance, try again later</h1>', 1000);
} catch (error) {
  console.error(error);
}
Happy crawling!