find common words of two webpages on the fly - html

I have a list of species here:
http://megasun.bch.umontreal.ca/ogmp/projects/other/compare.html
And a list of species here:
http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=3524
I would like to find all species that are mentioned on BOTH pages. How can I do this quickly? (I dont mind if words not referring to species are to be found. I want to do comparision of words in general:)
Thanks for suggestions.

On each page in a console, do:
var html = document.body.innerHTML;
results = [];
html.match(/>([^<]+?)</g) // grab all values like ">...<"
.map(function(match) { // look for a long words..words..words
return match.match(/\w.*\w/);
})
.filter(function(match) { // ignore empty matches
return match!==null
})
.forEach(function(match) {
var text = match[0];
if (!text.match(/[0-9]/) && // ignore matches with numbers
results.indexOf(text)==-1) // add to results if not duplicate
results.push(text);
});
JSON.stringify(results);
Then do:
var page1 = JSON.parse(' /*COPY-PASTE THE RESULT OF PAGE 1*/ ');
var page2 = JSON.parse(' /*COPY-PASTE THE RESULT OF PAGE 2*/ ');
page1.map(function(s){return page2.indexOf(s)!=-1});
This is necessary to circumvent browser restrictions.
Demo:
> JSON.stringify( page1.filter(function(s){return page2.indexOf(s)!=-1}) )
'["Beta vulgaris","Spinacia oleracea"]'

Related

Partial replace in docs what matches only and preserve formatting

Let's assume that we have first paragraph in our google document:
Wo1rd word so2me word he3re last.
We need to search and replace some parts of text but it must be highlighted in editions history just like we changed only that parts and we must not loose our format (bold, italic, color etc).
What i have/understood for that moment: capturing groups didn't work in replaceText() as described in documentation. We can use pure js replace(), but it can be used only for strings. Our google document is array of objects, not strings. So i did a lot of tries and stopped at that code, attached in this message later.
Can't beat: how i can replace only part of what i've found. Capturing groups is very powerful and suitable instrument, but i can't use it for replacement. They didn't work or i can replace whole paragraph, that is unacceptable because of editions history will show full paragraph replace and paragraphs will lose formatting. What if what we searching will be in each and every paragraph, but only one letter must be changed? We will see full document replacement in history and it will be hard to find what really changed.
My first idea was to compare strings, that replace() gives to me with contents of paragraph then compare symbol after symbol and replace what is different, but i understand, that it will work only if we are sure that only one letter changed. But what if replace will delete/add some words, how it can be synced? It will be a lot bigger problem.
All topics that i've found and read triple times didn't helped and didn't moved me from the dead point.
So, is there any ideas how to beat that problem?
function RegExp_test() {
var docParagraphs = DocumentApp.getActiveDocument().getBody().getParagraphs();
var i = 0, text0, text1, test1, re, rt, count;
// equivalent of .asText() ???
text0 = docParagraphs[i].editAsText(); // obj
// equivalent of .editAsText().getText(), .asText().getText()
text1 = docParagraphs[i].getText(); // str
if (text1 !== '') {
re = new RegExp(/(?:([Ww]o)\d(rd))|(?:([Ss]o)\d(me))|(?:([Hh]e)\d(re))/g); // v1
// re = new RegExp(/(?:([Ww]o)\d(rd))/); // v2
count = (text1.match(re) || []).length; // re v1: 7, re v2: 3
if (count) {
test1 = text1.match(re); // v1: ["Wo1rd", "Wo", "rd", , , , , ]
// for (var j = 0; j < count; j++) {
// test1 = text1.match(re)[j];
// }
text0.replaceText("(?:([Ww]o)\\d(rd))", '\1-A-\2'); // GAS func
// #1: \1, \2 etc - didn't work: " -A- word so2me word he3re last."
test1 = text0.getText();
// js func, text2 OK: "Wo1rd word so-B-me word he3re last.", just in memory now
text1 = text1.replace(/(?:([Ss]o)\d(me))/, '$1-B-$2'); // working with str, not obj
// rt OK: "Wo1rd word so-B-me word he-C-re last."
rt = text1.replace(/(?:([Hh]e)\d(re))/, '$1-C-$2');
// #2: we used capturing groups ok, but replaced whole line and lost all formatting
text0.replaceText(".*", rt);
test1 = text0.getText();
}
}
Logger.log('Test finished')
}
Found a solution. It's a primitive enough but it can be a base for a more complex procedure that can fix all occurrences of capture groups, detect them, mix them etc. If someone wants to improve that - you are welcome!
function replaceTextCG(text0, re, to) {
var res, pos_f, pos_l;
var matches = text0.getText().match(re);
var count = (matches || []).length;
to = to.replace(/(\$\d+)/g, ',$1,').replace(/^,/, '').replace(/,$/, '').split(",");
for (var i = 0; i < count; i++) {
res = re.exec(text0.getText())
for (var j = 1; j < res.length - 1; j++) {
pos_f = res.index + res[j].length;
pos_l = re.lastIndex - res[j + 1].length - 1;
text0.deleteText(pos_f, pos_l);
text0.insertText(pos_f, to[1]);
}
}
return count;
}
function RegExp_test() {
var docParagraphs = DocumentApp.getActiveDocument().getBody().getParagraphs();
var i = 0, text0, count;
// equivalent of .asText() ???
text0 = docParagraphs[i].editAsText(); // obj
if (text0.getText() !== '') {
count = replaceTextCG(text0, /(?:([Ww]o)\d(rd))/g, '$1A$2');
count = replaceTextCG(text0, /(?:([Ss]o)\d(me))/g, '$1B$2');
count = replaceTextCG(text0, /(?:([Hh]e)\d(re))/g, '$1C$2');
}
Logger.log('Test finished')
}

Is there a simple way to have a local webpage display a variable passed in the URL?

I am experimenting with a Firefox extension that will load an arbitrary URL (only via HTTP or HTTPS) when certain conditions are met.
With certain conditions, I just want to display a message instead of requesting a URL from the internet.
I was thinking about simply hosting a local webpage that would display the message. The catch is that the message needs to include a variable.
Is there a simple way to craft a local web page so that it can display a variable passed to it in the URL? I would prefer to just use HTML and CSS, but adding a little inline javascript would be okay if absolutely needed.
As a simple example, when the extension calls something like:
folder/messageoutput.html?t=Text%20to%20display
I would like to see:
Message: Text to display
shown in the browser's viewport.
You can use the "search" property of the Location object to extract the variables from the end of your URL:
var a = window.location.search;
In your example, a will equal "?t=Text%20to%20display".
Next, you will want to strip the leading question mark from the beginning of the string. The if statement is just in case the browser doesn't include it in the search property:
var s = a.substr(0, 1);
if(s == "?"){s = substr(1);}
Just in case you get a URL with more than one variable, you may want to split the query string at ampersands to produce an array of name-value pair strings:
var R = s.split("&");
Next, split the name-value pair strings at the equal sign to separate the name from the value. Store the name as the key to an array, and the value as the array value corresponding to the key:
var L = R.length;
var NVP = new Array();
var temp = new Array();
for(var i = 0; i < L; i++){
temp = R[i].split("=");
NVP[temp[0]] = temp[1];
}
Almost done. Get the value with the name "t":
var t = NVP['t'];
Last, insert the variable text into the document. A simple example (that will need to be tweaked to match your document structure) is:
var containingDiv = document.getElementById("divToShowMessage");
var tn = document.createTextNode(t);
containingDiv.appendChild(tn);
getArg('t');
function getArg(param) {
var vars = {};
window.location.href.replace( location.hash, '' ).replace(
/[?&]+([^=&]+)=?([^&]*)?/gi, // regexp
function( m, key, value ) { // callback
vars[key] = value !== undefined ? value : '';
}
);
if ( param ) {
return vars[param] ? vars[param] : null;
}
return vars;
}

How do I find the page number/number of pages in a document?

I want to create a new document based on a template and need to know when my insertion or append results in a new page in the final printed output is there any property/attribute eg number of pages that can be used for this?
I've search this a lot in the past and I don't think there's any property or any other way to know page info.
The solution I use is to insert page breaks on my template or via the script, using my own knowledge of how my template works, i.e. how much space it takes as I iterate, etc.
And then I know which page I am by counting the page breaks.
Anyway, you could an enhancement request on the issue tracker.
One way to get total number of pages:
function countPages() {
var blob = DocumentApp.getActiveDocument().getAs("application/pdf");
var data = blob.getDataAsString();
var re = /Pages\/Count (\d+)/g;
var match;
var pages = 0;
while(match = re.exec(data)) {
Logger.log("MATCH = " + match[1]);
var value = parseInt(match[1]);
if (value > pages) {
pages = value;
}
}
Logger.log("pages = " + pages);
return pages;
}

What's the fastest way to search a very long list of words for a match in actionscript 3?

So I have a list of words (the entire English dictionary).
For a word matching game, when a player moves a piece I need to check the entire dictionary to see if the the word that the player made exists in the dictionary. I need to do this as quickly as possible. simply iterating through the dictionary is way too slow.
What is the quickest algorithm in AS3 to search a long list like this for a match, and what datatype should I use? (ie array, object, Dictionary etc)
I would first go with an Object, which is a hash table (at least, storage-wise).
So, for every word in your list, make an entry in your dictionary Object and store true as its value.
Then, you just have to check if a given word is a key into your dictionary to know whether the word the user has choosen is valid or not.
This works really fast in this simple test (with 10,000,000 entries):
var dict:Object = {};
for(var i:int = 0; i < 10000000; i++) {
dict[i] = true;
}
var btn:Sprite = new Sprite();
btn.graphics.beginFill(0xff0000);
btn.graphics.drawRect(0,0,50,50);
btn.graphics.endFill();
addChild(btn);
btn.addEventListener(MouseEvent.CLICK,checkWord);
var findIt:Boolean = true;
function checkWord(e:MouseEvent):void {
var word:String;
if(findIt) {
word = "3752132";
} else {
word = "9123012456";
}
if(dict[word]) {
trace(word + " found");
} else {
trace(word + " not found");
}
findIt = !findIt;
}
It takes a little longer to build the dictionary, but lookup is almost instantaneous.
The only caveat is that you will have to consider certain keys that will pass the check and not necessarily be part of your words list. Words such as toString, prototype, etc. There are just a few of them, but keep that in mind.
I would try something like this with your real data set. If it works fine, then you have a really easy solution. Go have a beer (or whatever you prefer).
Now, if the above doesn't really work after testing it with real data (notice I've build the list with numbers cast as strings for simplicity), then a couple of options, off the top of my head:
1) Partition the first dict into a set of dictionaries. So, instead of having all the words in dict, have a dictionary for words that begin with 'a', another for 'b', etc. Then, before looking up a word, check the first char to know where to look it up.
Something like:
var word:String = "hello";
var dictKey:String = word.charAt(0);
// actual check
if(dict[dictKey][word]) {
trace("found");
} else {
trace("not found");
}
You can eventually repartition if necessary. I.e, make dict['a'] point to another set of dictionaries indexed by the first two characters. So, you'll have dict['a']['b'][wordToSearch]. There are a number of possible variations on this idea (you'd also have to come up with some strategy to cope with words of two letters, such as "be", for instance).
2) Try a binary search. The problem with it is that you'll first have to sort the list, upfront. You have to do it just once, as it doesn't make sense to remove words from your dict. But with millions of words, it might be rarther intensive.
3) Try some fancy data structures from open source libraries such as:
http://sibirjak.com/blog/index.php/collections/as3commons-collections/
http://lab.polygonal.de/ds/
But again, as I said above, I'd first try the easiest and simpler solution and check if it works against the real data set.
Added
A simple way to deal with keywords used for Object's built-in properties:
var dict:Object = {};
var keywordsInDict:Array = [];
function buildDictionary():void {
// let's assume this is your original list, retrieved
// from XML or other external means
// it contains "constructor", which should be dealt with
// separately, as it's a built-in prop of Object
var sourceList:Array = ["hello","world","foo","bar","constructor"];
var len:int = sourceList.length;
var word:String;
// just a dummy vanilla object, to test if a word in the list
// is already in use internally by Object
var dummy:Object = {};
for(var i:int = 0; i < len; i++) {
// also, lower-casing is a good idea
// do that when you check words as well
word = sourceList[i].toLowerCase();
if(!dummy[word]) {
dict[i] = true;
} else {
// it's a keyword, so store it separately
keywordsInDict.push(word);
}
}
}
Now, just add an extra check for built-in props in the checkWords function:
function checkWord(e:MouseEvent):void {
var word:String;
if(findIt) {
word = "Constructor";
} else {
word = "asdfds";
}
word = word.toLowerCase();
var dummy:Object = {};
// check first if the word is a built-in prop
if(dummy[word]) {
// if it is, check if that word was in the original list
// if it was present, we've stored it in keywordsInDict
if(keywordsInDict.indexOf(word) != -1) {
trace(word + " found");
} else {
trace(word + " not found");
}
// not a built-in prop, so just check if it's present in dict
} else {
if(dict[word]) {
trace(word + " found");
} else {
trace(word + " not found");
}
}
findIt = !findIt;
}
This isn't specific to ActionScript, but a Trie is a suitable data structure for storing words.

xul-Get selection html

I have the following function that is supposed to get HTMLs for the user selected area on the web page. This function does not seems to work properly.
Sometime, it gets htmls which is not selected also.
Can anyone please look into this function? -- Thanks a lot.
//----------------------------Get Selected HTML------------------------
function getSelectionHTML(){
if (window.getSelection)
{
var focusedWindow = document.commandDispatcher.focusedWindow;
var sel = focusedWindow.getSelection();
var html = "";
var r = sel.getRangeAt(0);
var parent_element = r.commonAncestorContainer;
var prev_html = parent_element.innerHTML;
if(prev_html != undefined)
{
return prev_html;
}
return sel;
}
return null;
}
It looks to me like you're getting the contents of the parent element rather than the selection itself. If the parent element contains anything other than what you have selected, then you'll get that too.
var sel = focusedWindow.getSelection();
This line returns a selection object. It contains the exact text selected by the user. You then get the range from the selection and get the commonAncestorContainer. So if you have code like this:
<div id="ancestor">
<p>First sentence.</p>
<p>Another sentence.</p>
</div>
And your user selects from the 's' of the first sentence to the 's' of the second sentence then the commonAncestorContainer is the div element so you'll also get the rest of the text.
A good reason for this would be if you wanted to guarantee yourself a valid HTML fragment (this seems to be the case, implied by your function name), but if you just want the selected text then call the toString method on the range directly:
var focusedWindow = document.commandDispatcher.focusedWindow;
var sel = focusedWindow.getSelection();
var r = sel.getRangeAt(0);
return r.toString();