< Day Day Up > |
Turn the Web into the ultimate cross-referenced library. Stefan Magdalinski of Whitelabel.org (http://www.whitelabel.org) created a bit of a stir with his WikiProxy, which added links to the BBC's news articles that pointed to pages in the online encyclopedia Wikipedia (http://www.wikipedia.org). The proxy worked by reading in a BBC page, extracting candidates for linking using specially tailored regular expressions, and then comparing these candidates to a list of phrases from the Wikipedia database. This raises the possibility of extending this functionality beyond the BBC site. It's not feasible to proxy the entire Web (unless you're Google), but it sounds like a perfect task for a Greasemonkey script. One big problem: you need to check the term candidates against the Wikipedia database, which weighs in at a hefty 18 megabytes for the article titles alone. Stefan, author of the original WikiProxy, has kindly agreed to make the Wikipedia term lookup accessible as a web service. This hack uses his web service to look up possible Wikipedia entries and adds links to the current page based on the keyword lookup.
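11.5.1. The Code
This user script runs on all pages. It is quite complex, but it breaks down into five steps:
1. Walk the text nodes of the page and collect their text.
2. Extract candidate terms (capitalized phrases and acronyms) from that text with regular expressions.
3. Send the candidates to the Wikipedia term lookup web service with GM_xmlhttpRequest.
4. Parse the XML response to find out which candidates have Wikipedia entries.
5. Walk the text nodes again and wrap each occurrence of a confirmed term in a link to its Wikipedia article, skipping text that is already inside a link, a form control, a script, or a style element.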
// To minimize the load on Stefan's keyword lookup service, we use an
// associative array, usedTerms, to keep track of which term candidates have
// been found on the page. This saves time and bandwidth by ensuring that each
// potential keyword is checked only once.
// Save the following user script as wikipedia-proxy.user.js:

// ==UserScript==
// @name          Wikiproxy: Greasemonkey Edition
// @namespace     http://www.allpeers.com/blog/greasemonkey
// @description   Adds Wikipedia links to key terms in webpages
// @include       http://*
// @exclude       http://wikiproxy.whitelabel.org/*
// @exclude       http://www.theyworkforyou.com/*
// @exclude       http://*.wikipedia.tld/*
// ==/UserScript==

// based on code by Matthew Gertner, Valentin Laube, and others
// and included here with their gracious permission

var iconcolor = 0; // 0 blue, 1 green, 2 red

// Link icons in the normal state, indexed by iconcolor.
var icons = [
    "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKAgMAAADwXCcu"+
    "AAAADFBMVEUWJgGkyP%2F%2F%2F%2F8AZv87Gt1vAAAAAXRSTlMAQObYZgAAAC1JRE"+
    "FUCNdjYP7%2FgYF51QYG%2Fv8bGHhXb2BgXg2hzco%2FMNw8z8BgzsDAwMPAAAAtcQ"+
    "zEPgrvTwAAAABJRU5ErkJggg%3D%3D",
    "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKAgMAAADwXCcu"+
    "AAAADFBMVEUbHgGW0nL%2F%2F%2F9SqxuFak3hAAAAAXRSTlMAQObYZgAAAC1JREFU"+
    "CNdjYP7%2FgYF51QYG%2Fv8bGHhXb2BgXg2hzco%2FMNw8z8BgzsDAwMPAAAAtcQzE"+
    "PgrvTwAAAABJRU5ErkJggg%3D%3D",
    "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKAgMAAADwXCcu"+
    "AAAADFBMVEUbHgH%2Fq47%2F%2F%2F%2F%2FVBnJ4fJlAAAAAXRSTlMAQObYZgAAAC"+
    "1JREFUCNdjYP7%2FgYF51QYG%2Fv8bGHhXb2BgXg2hzco%2FMNw8z8BgzsDAwMPAAA"+
    "AtcQzEPgrvTwAAAABJRU5ErkJggg%3D%3D"
];

// Link icons in the hover state (class wikilink_over), indexed by iconcolor.
var icons2 = [
    "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs%2"+
    "B9AAAAeElEQVQY02NkQAIB0%2F7%2FZ8ABWNAF1mdiKgqczsDASMi09ZkQhSgmFjoz"+
    "MNipoZoEA0wwxoYsRsb%2BvQw4AROy1YXOuBUyIyuyU2NgOHSLgaFgFQODFD8Dw4n7"+
    "DAwrzyCZdPDm%2F%2F%2F%2F%2F0NomKcCpiHYKOGHrAgvIKQIAECSPtEmaizfAAAA"+
    "AElFTkSuQmCC",
    "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs%2"+
    "B9AAAAi0lEQVQY02NkQAIdR5P%2FM%2BAALOgC5VZzMBR1HkthYCRkWrnVHIbOYymo"+
    "JupLWzN4yCeimAQDTDBGhfVcxotPj%2BJyIkJhx9Hk%2F%2FrS1jgVMiIr8pBPZNjx"+
    "cD7DxadHGfSlrRlQbOg4mvx%2F%2B4N5%2F%2F%2F%2F%2F%2F9%2F%2B4N5%2F2Ge"+
    "6jia%2FB%2FDgzDF%2BMKRqMBmYGBgAAApFkF%2BHyXzbAAAAABJRU5ErkJggg%3D%"+
    "3D",
    "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs%2"+
    "B9AAAAf0lEQVQY042QyxGCQBBE36BRQQ4UKSw5QA4sOeDVo5gDxOCdQNqDfN1asS8z"+
    "NfWqa7qNneSdiOgaXKoupNoSO3WrOmjLL8esgDQ%2FOC1KlsXqmzE8Yi9uoLwTWREF"+
    "LwcozWF8wr0BM5heMPabk4Zekj5zDiXvFARc4R89%2FlU2wBsinj50jQNjuAAAAABJ"+
    "RU5ErkJggg%3D%3D"
];

var bgprefix = "url(";
var bgsuffix = ") center right no-repeat";

var requestUrl = "http://wikiproxy.whitelabel.org/xml.php";
var wikipediaUrlPrefix = "http://en.wikipedia.org/wiki/";

// Text inside any of these elements is neither sent to the lookup service
// nor turned into links. "head" keeps anchors out of <title>.
var excludeAncestors = ["a", "script", "style", "input", "textarea",
                        "select", "option", "head"];
var excludeXPath = "ancestor::*[";
for (var tagNum = 0; tagNum < excludeAncestors.length; tagNum++) {
    excludeXPath += (tagNum === 0 ? "" : " or ") +
        "self::" + excludeAncestors[tagNum];
}
excludeXPath += "]";

// Regular expression definitions from News Wikiproxy
var capsword = "A|[A-Z][a-zA-Z'0-9]{1,}";
var fillerwords = "a|of|and|in|on|under|the";
// "\\." (not "\.") so the pattern contains an escaped dot: an initial such
// as "F." rather than a capital letter followed by any character.
var middlewordre = "(" + capsword + "|" + fillerwords + "|[A-Z]\\.)[ \\t]*";
var endwordre = "(" + capsword + ")[ \\t]*";
var acronymre = "\\b([A-Z][A-Z0-9]{2,})\\b";

// Match either "Two Endwords" or "Endword and Some Middle Words"
var greedyproperre = "\\b(" + endwordre + "(" + middlewordre + ")*" +
    endwordre + ")\\b";

// Match without filler words (so if you have a phrase like
// "Amnesty International and Human Rights Watch" you also get both parts
// separately "Amnesty International" and "Human Rights Watch")
var frugalproperre = "\\b((" + endwordre + "){2,})\\b";

// Candidate terms already collected on this page, keyed by the term text,
// so that each one is sent to the lookup service only once.
var usedTerms = {};

/**
 * Own-property test that ignores anything inherited from Object.prototype.
 */
function hasOwn(obj, key) {
    return Object.prototype.hasOwnProperty.call(obj, key);
}

/**
 * Injects the stylesheet for the wikilink / wikilink_over classes.
 * Falls back to the root element when the page has no <head>.
 */
function addWikiLinkStyle() {
    var wikiLinkStyle = document.createElement('style');
    wikiLinkStyle.id = "wikilinkstyle";
    wikiLinkStyle.type = "text/css";
    var css = '.wikilink, .wikilink_over {\n' +
        'color: inherit;\n' +
        'padding-right: 13px;\n' +
        '}\n' +
        '.wikilink {\n' +
        'background: transparent ' + bgprefix + icons[iconcolor] +
        bgsuffix + ';\n' +
        '}\n' +
        '.wikilink_over {\n' +
        'background: transparent ' + bgprefix + icons2[iconcolor] +
        bgsuffix + ';\n' +
        '}';
    wikiLinkStyle.appendChild(document.createTextNode(css));
    var head = document.getElementsByTagName('head')[0] ||
        document.documentElement;
    head.appendChild(wikiLinkStyle);
}

/**
 * Appends to `terms` every match of `regexpstr` in `str` that has not been
 * recorded in usedTerms yet, and records it.
 *
 * @param {string} str        text to scan
 * @param {string} regexpstr  regular expression source, compiled with "mg"
 * @param {string} terms      space-separated list built so far
 * @returns {string} the extended list; inside a term, runs of spaces/tabs
 *                   are written as a single underscore
 */
function getTerms(str, regexpstr, terms) {
    // String.prototype.match returns null, not [], when nothing matches.
    var candidates = str.match(new RegExp(regexpstr, "mg")) || [];
    for (var i = 0; i < candidates.length; i++) {
        // The word patterns end in [ \t]*, so matches may carry trailing
        // blanks.
        var term = candidates[i].replace(/[ \t]+$/, "");
        if (term.length === 0 || hasOwn(usedTerms, term)) {
            continue;
        }
        if (terms.length > 0) {
            terms += " ";
        }
        terms += term.replace(/[ \t]+/g, "_");
        usedTerms[term] = term;
    }
    return terms;
}

/**
 * Escapes regular-expression metacharacters so `str` matches literally.
 */
function escapeRegExp(str) {
    return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/**
 * Builds one global regular expression matching any of the given terms at
 * word boundaries. Longer terms are tried first so that "New York City" is
 * preferred over "New York".
 *
 * @param {string[]} termList  literal terms (spaces, not underscores)
 * @returns {RegExp|null} null when the list is empty
 */
function buildTermRegExp(termList) {
    if (termList.length === 0) {
        return null;
    }
    var sorted = termList.slice().sort(function (a, b) {
        return b.length - a.length;
    });
    var parts = [];
    for (var i = 0; i < sorted.length; i++) {
        parts.push(escapeRegExp(sorted[i]));
    }
    return new RegExp("\\b(" + parts.join("|") + ")\\b", "mg");
}

/**
 * Returns the Wikipedia article URL for a term written with spaces.
 */
function wikipediaUrlFor(term) {
    return wikipediaUrlPrefix + encodeURIComponent(term.replace(/ /g, "_"));
}

/**
 * True when `node` (a node of the page) sits inside one of the
 * excludeAncestors elements.
 */
function isExcludedNode(node) {
    return document.evaluate(excludeXPath, node, null,
        XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue !== null;
}

/**
 * Creates a tree walker over every text node of the page.
 */
function createTextWalker() {
    return document.createTreeWalker(document.documentElement,
        NodeFilter.SHOW_TEXT, null, false);
}

/**
 * Builds the <a class="wikilink"> element for one occurrence of a term.
 */
function createWikiAnchor(term) {
    var anchor = document.createElement("a");
    anchor.className = "wikilink";
    anchor.addEventListener('mouseover', function () {
        this.className = 'wikilink_over';
    }, true);
    anchor.addEventListener('mouseout', function () {
        this.className = 'wikilink';
    }, true);
    anchor.href = wikipediaUrlFor(term);
    anchor.appendChild(document.createTextNode(term));
    return anchor;
}

/**
 * Splits `textNode` around every match of `termRegExp`, inserting a wiki
 * link for each match. Match offsets come from the regular expression
 * itself, so a link is only placed where the word-boundary match occurred.
 */
function linkTermsInTextNode(textNode, termRegExp) {
    var text = textNode.nodeValue;
    var parent = textNode.parentNode;
    var cursor = 0;
    var match;
    termRegExp.lastIndex = 0; // the regexp is global and reused across nodes
    while ((match = termRegExp.exec(text)) !== null) {
        var term = match[0];
        if (term.length === 0) {
            termRegExp.lastIndex++; // never loop forever on an empty match
            continue;
        }
        if (!document.getElementById('wikilinkstyle')) {
            addWikiLinkStyle();
        }
        if (match.index > cursor) {
            parent.insertBefore(
                document.createTextNode(text.substring(cursor, match.index)),
                textNode);
        }
        parent.insertBefore(createWikiAnchor(term), textNode);
        cursor = match.index + term.length;
    }
    if (cursor > 0) {
        // What is left of the original node is the text after the last link.
        textNode.nodeValue = text.substring(cursor);
    }
}

/**
 * Extracts the confirmed terms from the lookup service's XML response.
 *
 * @param {string} responseText  body of the response
 * @returns {string[]} terms with underscores turned back into spaces
 */
function parseTermList(responseText) {
    var responseXML = new DOMParser().parseFromString(responseText,
        "text/xml");
    var termSnapshot = responseXML.evaluate("/wikiproxy/term/text()",
        responseXML, null, XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE, null);
    var termList = [];
    for (var i = 0; i < termSnapshot.snapshotLength; i++) {
        var term = termSnapshot.snapshotItem(i).nodeValue.replace(/_/g, " ");
        if (term.length > 0) {
            termList.push(term);
        }
    }
    return termList;
}

/**
 * Links every occurrence of the confirmed terms in the page.
 */
function linkReturnedTerms(responseText) {
    var termRegExp = buildTermRegExp(parseTermList(responseText));
    if (termRegExp === null) {
        return; // nothing confirmed (or an unparsable response)
    }
    var walker = createTextWalker();
    var textNode;
    while ((textNode = walker.nextNode())) {
        if (!isExcludedNode(textNode)) {
            linkTermsInTextNode(textNode, termRegExp);
        }
    }
}

/**
 * Collects candidate terms from the page and asks the lookup service which
 * of them have Wikipedia entries.
 */
function wikifyPage() {
    var walker = createTextWalker();
    var text = "";
    var textNode;
    while ((textNode = walker.nextNode())) {
        // Text that can never be linked is not worth looking up.
        if (!isExcludedNode(textNode)) {
            text += textNode.nodeValue + "\n";
        }
    }

    var terms = getTerms(text, greedyproperre, "");
    terms = getTerms(text, frugalproperre, terms);
    terms = getTerms(text, acronymre, terms);
    if (terms.length === 0) {
        return; // no candidates: spare the lookup service a request
    }

    GM_xmlhttpRequest({
        method: 'POST',
        url: requestUrl,
        headers: {
            'User-agent': 'Mozilla/4.0 (compatible) Greasemonkey',
            'Content-type': 'application/x-www-form-urlencoded'
        },
        // encodeURIComponent also encodes "+", which would otherwise be
        // read as a space in form-encoded data.
        data: 'text=' + encodeURIComponent(terms),
        onload: function (responseDetails) {
            linkReturnedTerms(responseDetails.responseText);
        }
    });
}

/**
 * Replaces every link added by this script with its plain text.
 */
function undoWikify() {
    var wlinks = document.evaluate(
        '//a[@class="wikilink" or @class="wikilink_over"]',
        document, null, XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE, null);
    for (var i = 0; i < wlinks.snapshotLength; i++) {
        var wlink = wlinks.snapshotItem(i);
        var text = document.createTextNode(wlink.textContent);
        wlink.parentNode.replaceChild(text, wlink);
    }
}

if (typeof document !== "undefined" && document.documentElement &&
        document.documentElement.tagName == "HTML") {
    wikifyPage();
}

if (typeof GM_registerMenuCommand === "function") {
    GM_registerMenuCommand('Undo Wikify', undoWikify);
}

// 11.5.2.
Running the Hack
After installing the script (Tools → Install This User Script), load http://www.cia.gov/cia/publications/factbook/geos/uk.html. The script converts all words on the page that have Wikipedia entries into links, decorated with a Wikipedia icon, as shown in Figure 11-6. We take care not to change text that was already linked in the original page.
— Matthew Gertner
Figure 11-6. Wikified CIA Factbook |
< Day Day Up > |