Recently, I needed a way to convert JavaScript escaped characters to HTML/XML entities so that files could be saved in UTF-8 encoding without writing the special characters themselves to disk. Luckily this was pretty easy. I used JavaScript's built-in function escape() to escape the input text and used regular expressions to find each escaped value and replace it in the escaped text with its HTML/XML entity equivalent. For example, the text "Copyright Some Company ©" would escape to "Copyright%20Some%20Company%20%A9" using JavaScript's built-in function escape(). After converting the escaped characters to entities, the result would be "Copyright&#x20;Some&#x20;Company&#x20;&#xA9;". When saved in a database and rendered as HTML to a webpage, it would display the original message "Copyright Some Company ©". Here's the code for anyone interested:
/**
 * Convert escaped characters to HTML/XML numeric character references.
 *
 * The text is first run through the built-in escape(), so every character
 * that escape() encodes (everything except A-Z a-z 0-9 @ * _ + - . /) ends
 * up as a hexadecimal reference such as "&#xA9;" or "&#x20AC;". Characters
 * that escape() leaves alone are returned unchanged.
 *
 * @param {*} text - value to convert; escape() coerces it to a string.
 * @returns {string} the text with escaped characters replaced by references.
 */
function normalizeText(text)
{
    // escape text for special characters
    var esc = escape(text);

    // Alternatives, in order:
    //   1. a %uXXXX%uXXXX surrogate pair (one character outside the BMP)
    //   2. a single %uXXXX sequence
    //   3. a single %XX sequence
    // escape() encodes a literal "%" as %25, so every "%" seen here starts
    // one of these sequences.
    var escapedCharRegex = /%u(D[89AB][0-9A-F]{2})%u(D[C-F][0-9A-F]{2})|%u([0-9A-F]{4})|%([0-9A-F]{2})/g;

    return esc.replace(escapedCharRegex, function (match, high, low, wide, narrow) {
        if (high) {
            // Surrogate code points are not valid character references on
            // their own, so merge the pair into the code point it encodes.
            var codePoint = ((parseInt(high, 16) - 0xD800) << 10) +
                (parseInt(low, 16) - 0xDC00) + 0x10000;
            return '&#x' + codePoint.toString(16).toUpperCase() + ';';
        }
        // The hex digits produced by escape() are reused as-is.
        return '&#x' + (wide || narrow) + ';';
    });
}
I understand there may be a more efficient way to do this, but this code does work in a pinch. Please let me know in the comments if you have any suggestions for improving this code.
Recently I needed a way to parse command line options with PhantomJS. I didn't see anything else on the web that allowed for arbitrary ordering of command line arguments to PhantomJS scripts, so I made my own. Here's the code for those interested:
// argument results
var a1, a2, a3, a4;

/**
 * Parse leading "-switch value" pairs from phantom.args into a1..a4.
 *
 * Parsing starts at the first argument and stops at the first argument
 * that does not begin with "-". The switches may appear in any order.
 * An unknown switch, or a switch that is the last argument and therefore
 * has no value, is reported on the console and the script is asked to exit
 * with status 1.
 *
 * NOTE(review): later PhantomJS releases expose the arguments through
 * require('system').args instead of phantom.args - confirm against the
 * version in use.
 */
function optionParser() {
    var args = phantom.args;

    // One setter per recognised switch; each stores the switch's value.
    var setters = {
        '-a1': function (value) { a1 = value; },
        '-a2': function (value) { a2 = value; },
        '-a3': function (value) { a3 = value; },
        '-a4': function (value) { a4 = value; }
    };

    var opt = 0;
    while (opt < args.length && args[opt].charAt(0) === '-') {
        var sw = args[opt];

        if (!Object.prototype.hasOwnProperty.call(setters, sw)) {
            console.log('Unknown switch: ' + sw);
            phantom.exit(1);
            // Return explicitly so no further arguments are consumed even
            // if phantom.exit() does not halt the script immediately.
            return;
        }

        if (opt + 1 >= args.length) {
            console.log('Missing value for switch: ' + sw);
            phantom.exit(1);
            return;
        }

        // The value is taken verbatim, even if it starts with "-".
        setters[sw](args[opt + 1]);
        opt += 2;
    }
}
This can easily be modified to store the results in an array, or you can simply read each argument into its own variable as shown. Note that every argument arrives as a string, so if you expect an integer, convert it with parseInt() and use isNaN() in your application logic to check that the input is a valid integer.
Recently, I needed a way to pass dynamic content to and from webpages using PhantomJS as part of writing my screen scraper. I need the scraper to follow dynamic sets of links and scrape the data from each page. Since a webpage's scope is currently sandboxed, I had to find a way to pass data to and from webpages. With the addition of the new filesystem module in PhantomJS 1.3, it is now possible to pass data from the main scope to an individual page's scope. Any data that you want passed to a particular page should be saved as a JavaScript string to a JavaScript file. Then you can inject that file into the page in page.onLoadFinished so that the data is accessible within the page's scope. For example:
// Pass data into a page's scope: write it to a JavaScript file, inject that
// file once the page has loaded, then read the value back via page.evaluate.
var page = require('webpage').create(),
    fs = require('fs'),
    data = "var dataObject = { item: 'value' };",
    fullpath;
fullpath = fs.workingDirectory + fs.separator + 'data.js';

// open file for writing; close the handle even if the write throws
var dataFile = fs.open(fullpath, 'w');
try {
    dataFile.write(data);
} finally {
    dataFile.close();
}

// The handler is registered before page.open() is called. Injection and
// evaluation both happen here because dataObject only exists in the page
// after the load has finished and the data file has been injected.
page.onLoadFinished = function(status) {
    if (status !== 'success') {
        console.log('Error in loading the page!');
        phantom.exit(1);
        return;
    }
    // inject the javascript data that we created earlier
    if (!page.injectJs(fullpath)) {
        console.log('Error in injecting the file!');
        phantom.exit(1);
        return;
    }
    // put page data in a local variable
    var output = page.evaluate(function () {
        // This runs in the page's context; its console output is only
        // visible from the script if page.onConsoleMessage is set.
        console.log(dataObject.item);
        return dataObject.item;
    });
    // output should be the same value as the page's dataObject.item
    console.log(output);
    phantom.exit();
};

// check that the file was successfully written
if (fs.size(fullpath) > 0) {
    console.log('File wrote successfully!');
    page.open('http://somesite.org/page.html');
}
else {
    console.log('Error in writing the file!');
    phantom.exit(1);
}
For more information about PhantomJS' File System module, please visit: http://code.google.com/p/phantomjs/wiki/Interface#Filesystem_Module
While this solution may not be the best long term solution, it does provide a way to get data to and from your pages until official support for passing data to a webpage object becomes available in PhantomJS.
Recently I came across a question on Stack Overflow that asked how to take screenshots of all HTML files in a local folder. I have been playing with PhantomJS quite a bit lately and felt comfortable answering the question. Here is the code for those interested:
// Render every .html file in the working directory to images/outputN.png.
var page = require('webpage').create(), loadInProgress = false, fs = require('fs');
var htmlFiles = [];
var htmlExtension = '.html';
console.log('working directory: ' + fs.workingDirectory);
var curdir = fs.list(fs.workingDirectory);
// loop through files and folders
for (var i = 0; i < curdir.length; i++)
{
    var fullpath = fs.workingDirectory + fs.separator + curdir[i];
    // Compare the end of the path so that only a real ".html" extension
    // matches, not ".html" appearing somewhere in the middle of the name.
    var hasHtmlExtension = fullpath.slice(-htmlExtension.length) === htmlExtension;
    // check if item is a file
    if (hasHtmlExtension && fs.isFile(fullpath))
    {
        // show full path of file
        console.log('File path: ' + fullpath);
        htmlFiles.push(fullpath);
    }
}
console.log('Number of Html Files: ' + htmlFiles.length);

// output pages as PNG
var pageindex = 0;

// Handlers are registered before the polling loop can open any page.
page.onLoadStarted = function() {
    loadInProgress = true;
    console.log('page ' + (pageindex + 1) + ' load started');
};
page.onLoadFinished = function(status) {
    if (status === 'success') {
        // Render only after the load has finished so the image has content.
        page.render("images/output" + (pageindex + 1) + ".png");
        console.log('page ' + (pageindex + 1) + ' load finished');
    } else {
        console.log('page ' + (pageindex + 1) + ' failed to load');
    }
    pageindex++;
    loadInProgress = false;
};

// Poll every 250ms: open the next file once the previous load has finished,
// and exit when every file has been processed.
var interval = setInterval(function() {
    if (pageindex >= htmlFiles.length) {
        clearInterval(interval);
        console.log("image render complete!");
        phantom.exit();
        return;
    }
    if (!loadInProgress) {
        console.log("image " + (pageindex + 1));
        // Mark the load as started right away so that a later tick cannot
        // open the same file again before onLoadStarted has fired.
        loadInProgress = true;
        page.open(htmlFiles[pageindex]);
    }
}, 250);
The process is quite simple. First, I loop through all objects in the current working directory and check to see if each item is a file and whether it has the .html extension. Then I add each HTML file's path to an array that I later loop through to take the screenshots. A screenshot must be taken after the page is fully loaded so that the screenshot will contain actual content and not a blank image. This is done by saving the image in the page.onLoadFinished callback. The application loop for taking the screenshots polls every 250ms and only opens the next page once the previous one has finished loading, so each page is fully loaded into the headless browser before advancing to the next page.