by Eric Giguere
March 4, 2003
Download the source: webscraper.zip, quote.zip
On occasion, you may find that the only way to obtain certain information is to extract it from a page of HTML on a web site, in a procedure commonly called page scraping. It's similar to extracting data from an XML document, but HTML's syntax is much looser than XML's, so you can't use an XML parser to parse HTML pages. The code presented here parses an HTML page inside a servlet or directly from a Mobile Information Device Profile (MIDP) application.
The first thing you need to do is define some helper classes to hold the various parts of an HTML page. You're only interested in three kinds of data: tags, comments, and text. Because they're all page elements, you define Element to serve as a base class:
// Marker class for page elements.
package webscraper;
// Common base type for every piece of a page the scraper hands back.
// It declares no state or behavior of its own; Text, Comment and Tag
// extend it so that a caller can hold any of them in one variable and
// tell them apart with instanceof.
public class Element {
}
|
The Text class is the simplest subclass:
// Defines a page element for holding text
// outside of any tag or comment.
package webscraper;
// A page element holding a run of character data found outside of any
// tag or comment. The string is stored exactly as it was supplied.
public class Text extends Element {

    // The character data of this element.
    private final String text;

    // Wraps the given character data without modifying it.
    public Text( String text ){
        this.text = text;
    }

    // Returns the character data passed to the constructor.
    public String getText(){
        return text;
    }

    // A text element renders as its own content, with nothing added.
    public String toString(){
        return getText();
    }
}
|
The Comment class is identical to Text except that its toString() method adds the character sequences that start and end an HTML comment:
// Defines a page element for holding an
// HTML comment.
package webscraper;
// A page element holding the body of an HTML comment, i.e. the text
// between the opening "<!--" and the closing "-->".
public class Comment extends Element {

    // The comment body, without the surrounding comment markers.
    private final String text;

    // Wraps the given comment body without modifying it.
    public Comment( String text ){
        this.text = text;
    }

    // Returns the comment body passed to the constructor.
    public String getText(){
        return text;
    }

    // Renders the comment as it would appear in HTML source by putting
    // the start and end markers back around the body.
    public String toString(){
        StringBuffer html = new StringBuffer( "<!--" );
        html.append( getText() );
        html.append( "-->" );
        return html.toString();
    }
}
|
The Tag class is more complicated because it needs to parse the contents of a tag (the part between the < and > characters) into a tag name and a set of attributes:
// Defines a page element for holding an HTML tag.
// The tag name and the attributes are stored
// separately, the latter using an instance of
// Attributes.
package webscraper;
// A page element holding one HTML tag. The constructor splits the tag
// content into three parts: an end-tag flag (leading '/'), the tag name,
// and whatever follows the name, which is handed to Attributes.
public class Tag extends Element {

    // True when the tag content started with '/'.
    private boolean isEnd;

    // The tag name exactly as written (case is preserved); "" if none.
    private String name = "";

    // Everything after the name, or null when nothing follows the name.
    private Attributes attributes;

    // Creates a tag from the text found between '<' and '>'. For the tag
    // "<body bgcolor=#ffffff>" pass in "body bgcolor=#ffffff".
    public Tag( String tagContent ){
        String content = tagContent.trim();
        int length = content.length();
        if( length == 0 ){
            // Nothing between the brackets: keep the defaults.
            return;
        }

        int nameStart = 0;
        if( content.charAt( 0 ) == '/' ){
            isEnd = true;
            nameStart = 1;
        }

        // Skip any whitespace between the '/' and the name.
        while( nameStart < length &&
               content.charAt( nameStart ) <= ' ' ){
            nameStart++;
        }

        // The name runs up to the next whitespace character.
        int nameEnd = nameStart;
        while( nameEnd < length &&
               content.charAt( nameEnd ) > ' ' ){
            nameEnd++;
        }
        name = content.substring( nameStart, nameEnd );

        // Whatever is left over is the attribute text.
        if( nameEnd < length ){
            attributes =
                new Attributes( content.substring( nameEnd ) );
        }
    }

    // Returns the tag's attributes, or null if the tag had none.
    public Attributes getAttributes(){
        return attributes;
    }

    // Returns the tag name as written in the page.
    public String getName(){
        return name;
    }

    // Whether or not the tag is an end tag.
    public boolean isEndTag(){
        return isEnd;
    }

    // Rebuilds the tag as HTML: '<', an optional '/', the name, the
    // attribute text (if any) preceded by one space, then '>'.
    public String toString(){
        StringBuffer html = new StringBuffer();
        html.append( '<' );
        if( isEnd ){
            html.append( '/' );
        }
        html.append( name );
        if( attributes != null ){
            html.append( ' ' ).append( attributes.toString() );
        }
        return html.append( '>' ).toString();
    }
}
|
Tag uses a helper class to store the tag attributes. Oddly enough, Attributes is the most complicated of all these classes because HTML allows so much flexibility. This class does the attribute parsing in a lazy manner to conserve memory:
package webscraper;
import java.util.*;
// Defines a class for holding and parsing the
// attributes of an HTML tag. Note that the
// attributes are not parsed until they are
// actually needed.
public class Attributes {
private String attrs;
private String attrsLower;
private Hashtable hashtable;
// Pass in the attribute string from inside the
// tag, i.e. if the tag is "<body
// bgcolor=#ffffff>" pass in the
// " bgcolor=#ffffff" part.
public Attributes( String attrs ){
if( attrs == null ) attrs = "";
this.attrs = attrs.trim();
attrsLower = this.attrs.toLowerCase();
}
public boolean exists( String name ){
return( get( name ) != null );
}
public String get( String name ){
name = name.toLowerCase();
// Delay the parsing by checking if the
// attribute name is in the attribute set.
if( hashtable == null ){
if( attrsLower.indexOf( name ) == -1 ){
return null;
}
parse();
}
return (String) hashtable.get( name );
}
// Parse the attributes. Unlike XML, HTML
// attributes don't need a value and the value may
// or may not be quoted, so parsing is tricky.
private void parse(){
Vector tokens = new Vector();
int len = attrs.length();
char ch;
char delim = ' ';
boolean inWord = false;
boolean inString = false;
StringBuffer buf = new StringBuffer( len );
// Split the attribute string into tokens
for( int i = 0; i <= len; ++i ){
if( inWord ){
ch = ( i < len ? attrs.charAt(i) : ' ' );
if( ch <= ' ' || ch == '=' ){
if( buf.length() > 0 ){
tokens.addElement(
buf.toString() );
buf.setLength( 0 );
}
if( ch == '=' ){
tokens.addElement( "=" );
}
inWord = false;
} else {
buf.append( ch );
}
} else if( inString ){
ch = ( i < len ? attrs.charAt(i) :
delim );
if( ch == delim ){
tokens.addElement( buf.toString() );
buf.setLength( 0 );
inString = false;
} else {
buf.append( ch );
}
} else {
ch = ( i < len ? attrs.charAt(i) : ' ' );
if( ch == '\'' || ch == '"' ){
delim = ch;
inString = true;
} else if( ch > ' ' ){
buf.append( ch );
inWord = true;
}
}
}
// Now store the attributes as name-value pairs
// in a hashtable. If an attribute does not have
// a value, the empty string is used as its value.
hashtable = new Hashtable();
int tlen = tokens.size();
String name = null;
boolean nextIsValue = false;
for( int i = 0; i < tlen; ++i ){
String tok = (String) tokens.elementAt( i );
if( nextIsValue ){
hashtable.put( name, tok );
name = null;
nextIsValue = false;
} else if( name == null ){
name = tok.toLowerCase();
} else if( !tok.equals( "=" ) ){
hashtable.put( name, "" );
name = tok.toLowerCase();
} else {
nextIsValue = true;
}
}
if( name != null ){
hashtable.put( name, "" );
}
}
public String toString(){
return attrs;
}
}
|
With these classes in place you can now build a general-purpose HTML parsing class. PageScraper splits an input stream into a set of page elements:
package webscraper;
import java.io.*;
import java.util.*;
// Splits a stream of HTML into page elements (Tag, Comment, Text),
// handing them out one at a time through readElement().
public class PageScraper {

    // What kind of element readElement() is currently collecting.
    private static final int IN_TEXT = 0;
    private static final int IN_TAG = 1;
    private static final int IN_COMMENT = 2;

    private Reader reader;

    // A four-character lookahead window: "first" is the current
    // character and the others are the ones that follow it. -1 marks
    // the end of the stream.
    private int first;
    private int second = -1;
    private int third = -1;
    private int fourth = -1;

    // Convenience method for parsing a string.
    public PageScraper( String text ) throws IOException {
        this( new ByteArrayInputStream( text.getBytes() ) );
    }

    // Parse an input stream. The lookahead window is filled right away,
    // stopping at the first end-of-stream marker.
    public PageScraper( InputStream in ) throws IOException {
        reader = new InputStreamReader( in );
        first = reader.read();
        if( first == -1 ){
            return;
        }
        second = reader.read();
        if( second == -1 ){
            return;
        }
        third = reader.read();
        if( third == -1 ){
            return;
        }
        fourth = reader.read();
    }

    // Advance to the next character in the stream by sliding the
    // lookahead window one position. Does nothing once the current
    // character is already the end-of-stream marker.
    private void next() throws IOException {
        if( first == -1 ){
            return;
        }
        first = second;
        second = third;
        third = fourth;
        fourth = reader.read();
    }

    // Advance n characters into the stream.
    private void next( int n ) throws IOException {
        for( int remaining = n; remaining > 0; --remaining ){
            next();
        }
    }

    // Returns the next page element, or null when the stream is
    // exhausted. Use the instanceof operator to determine if it's a
    // Tag, Comment, or Text.
    public Element readElement() throws IOException {
        StringBuffer collected = new StringBuffer();
        int mode = IN_TEXT;

        scan:
        while( first != -1 ){
            switch( mode ){
            case IN_TAG:
                // A tag ends at the first '>', which is consumed but
                // not stored.
                if( first == '>' ){
                    next();
                    break scan;
                }
                break;

            case IN_COMMENT:
                // A comment ends at "-->", which is consumed but not
                // stored.
                if( first == '-' && second == '-' && third == '>' ){
                    next( 3 );
                    break scan;
                }
                break;

            default:
                if( first == '<' ){
                    // Text collected so far is returned first; the '<'
                    // stays in the window for the following call.
                    if( collected.length() > 0 ){
                        break scan;
                    }
                    if( second == '!' && third == '-' &&
                        fourth == '-' ){
                        mode = IN_COMMENT;
                        next( 4 );
                    } else {
                        mode = IN_TAG;
                        next();
                    }
                    continue;
                }
                break;
            }

            collected.append( (char) first );
            next();
        }

        String content = collected.toString();
        if( mode == IN_TAG ){
            return new Tag( content );
        }
        if( mode == IN_COMMENT ){
            return new Comment( content );
        }
        if( content.length() == 0 && first == -1 ){
            // Nothing collected and nothing left to read.
            return null;
        }
        return new Text( content );
    }
}
|
The parsing is done incrementally, by invoking readElement() to get the next page element. For example, this code...
...
// Feed a small in-memory page through the scraper and print one line
// per element: the element's class name followed by its text form.
try {
String html = "<!-- a page --><HTML>\n" +
"<body bgcolor=#FFFFFF>" +
"<P>Hah!\n" +
"</body></html>";
PageScraper sc = new PageScraper( html );
Element e;
// readElement() returns null once there is nothing left to read.
while( ( e = sc.readElement() ) != null ){
System.out.println( e.getClass().getName() +
": " + e.toString() );
}
}
// NOTE(review): the exception is silently ignored here. The input in
// this example is an in-memory string, but code that reads from a
// network stream should report the failure instead.
catch( java.io.IOException ioe ){
}
...
|
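...produces the following output on the console (the class names are shown here without their webscraper. package prefix, which getClass().getName() also prints):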
Comment: <!-- a page -->
Tag: <HTML>
Text:
Tag: <body bgcolor=#FFFFFF>
Tag: <P>
Text: Hah!
Tag: </body>
Tag: </html>
|
You're almost ready to start parsing some web pages. All you need now is the ability to make an HTTP request and get a web page as an input stream. How you do this depends on how you structure the application. In the ideal scenario your MIDlet delegates the parsing to a servlet, thereby offloading most of its work to an external machine. The MIDlet sends a command to the servlet, perhaps a request for a stock quote:
...
HttpConnection conn;
String url = "http://www.mywebserver.com/quote/mostrecent";
// Connector.open() is declared to return the generic Connection type,
// so the result must be cast to HttpConnection before it is assigned.
conn = (HttpConnection) Connector.open( url + "?tag=SUN" );
...
|
Here's an example of how you could implement such a servlet; Controller parses Yahoo's free quote service:
package quote;
import webscraper.*;
import java.io.*;
import java.net.*;
import java.util.*;
import javax.servlet.*;
import javax.servlet.http.*;
/**
 * A simple example of a servlet that answers a quote request sent by a
 * MIDP client. The client issues a GET request with a "tag" parameter;
 * the servlet fetches the corresponding page from the quote service,
 * scrapes the first number that directly follows a
 * {@code <font><b>} tag pair, and returns it as plain text. If no
 * number could be extracted, the body of the response is the string
 * "error".
 */
public class Controller extends HttpServlet {

    private static final String QuoteURL =
        "http://finance.yahoo.com/q?s=";

    /**
     * Handles the quote request. The response always has status 200 and
     * content type text/plain; the body is either the quote or "error".
     *
     * @throws IOException if closing the upstream connection or writing
     *         the response fails; a failure to fetch or read the quote
     *         page is logged and reported to the client as "error"
     */
    public void doGet( HttpServletRequest request,
                       HttpServletResponse response )
            throws IOException, ServletException {
        String tag = request.getParameter( "tag" );
        Double val = null;

        if( tag != null ){
            InputStream is = null;
            try {
                // The tag comes straight from the client, so it must be
                // encoded before it is placed in the query string;
                // otherwise spaces or characters such as '&' and '#'
                // would break or alter the upstream request.
                String query = URLEncoder.encode( tag, "UTF-8" );
                is = new URL( QuoteURL + query ).openStream();
                val = findQuote( new PageScraper( is ) );
            }
            catch( IOException e ){
                System.out.println( "Exception "
                    + e );
            }
            finally {
                if( is != null ){ is.close(); }
            }
        }

        // Set the response headers and data...
        String send = ( val != null ? val.toString() :
                                      "error" );
        response.setContentType( "text/plain" );
        response.setContentLength( send.length() );
        response.setStatus( HttpServletResponse.SC_OK );

        PrintStream out =
            new PrintStream( response.getOutputStream() );
        out.print( send );
        out.close();
    }

    // Scans the page for the pattern <font> ... <b> number and returns
    // the first number found that way, or null if the page ends first.
    //
    // After a "font" tag, further tags are tolerated until a "b" tag
    // shows up, but any non-tag element cancels the match. The element
    // right after the "b" tag must be text that parses as a number;
    // if it is not, the search starts over.
    private Double findQuote( PageScraper page ) throws IOException {
        boolean foundFont = false;
        boolean foundBold = false;
        Element e;

        while( ( e = page.readElement() ) != null ){
            if( foundFont && foundBold ){
                foundFont = false;
                foundBold = false;
                if( e instanceof Text ){
                    try {
                        return Double.valueOf( e.toString() );
                    }
                    catch( NumberFormatException ignored ){
                        // Not a number: keep looking for the next
                        // occurrence of the pattern.
                    }
                }
            } else if( foundFont ){
                if( e instanceof Tag ){
                    Tag t = (Tag) e;
                    if( t.getName().equalsIgnoreCase( "b" ) ){
                        foundBold = true;
                    }
                } else {
                    foundFont = false;
                }
            } else if( e instanceof Tag ){
                Tag t = (Tag) e;
                if( t.getName().equalsIgnoreCase( "font" ) ){
                    foundFont = true;
                }
            }
        }
        return null;
    }
}
|
Note that this servlet looks for the pattern <font><b>number, which is not a very robust test. A full implementation would check for more.
If the application is unable to use a servlet as a proxy, it can still connect to a web server and parse the page itself. You use MIDP's HttpConnection class, of course, and define a PageLoader class to hide all the boring details:
import java.io.*;
import javax.microedition.io.*;
// A helper class that manages the HTTP connection for loading a page.
// Connections are opened through the HttpConnectionHelper class (which,
// per the original author's note, takes care of redirects); this class
// acts as that helper's callback to configure each request.
public class PageLoader implements HttpConnectionHelper.Callback {

    // Request headers sent with every page request, in sending order.
    private static final String[][] REQUEST_HEADERS = {
        { "User-Agent", "Profile/MIDP-1.0 Configuration/CLDC-1.0" },
        { "Content-Language", "en-US" },
        { "Accept", "text/html" },
        { "Accept-Charset", "iso-8859-1" },
        { "connection", "close" }
    };

    private HttpConnection conn;   // connection of the last load
    private InputStream in;        // page content; null if not loaded
    private Throwable error;       // what went wrong in the last load
    private int rc;                // last HTTP response code received

    public PageLoader(){
    }

    // Call close when done with the page. Releases the stream and the
    // connection (ignoring any failure to close them) and forgets the
    // recorded exception.
    public synchronized void close(){
        closeStream();
        closeConnection();
        error = null;
    }

    // Closes and drops the input stream, if there is one.
    private void closeStream(){
        if( in == null ){
            return;
        }
        try {
            in.close();
        }
        catch( Exception ignored ){
            // Nothing useful can be done if closing fails.
        }
        in = null;
    }

    // Closes and drops the connection, if there is one.
    private void closeConnection(){
        if( conn == null ){
            return;
        }
        try {
            conn.close();
        }
        catch( Exception ignored ){
            // Nothing useful can be done if closing fails.
        }
        conn = null;
    }

    public HttpConnection getConnection(){
        return conn;
    }

    public Throwable getException(){
        return error;
    }

    public InputStream getInputStream(){
        return in;
    }

    public int getResponseCode(){
        return rc;
    }

    // Loads the page by forming the HTTP request. Any previously loaded
    // page is closed first. Returns true if HTTP_OK is returned, false
    // if any other response code is returned or an exception occurs;
    // in the latter case the exception is kept for getException().
    public synchronized boolean loadPage( String url ){
        close();

        try {
            conn = HttpConnectionHelper.connect( url, this );
            rc = conn.getResponseCode();
            if( HttpConnection.HTTP_OK == rc ){
                in = conn.openInputStream();
            }
        }
        catch( Throwable failure ){
            error = failure;
        }

        if( in == null ){
            return false;
        }
        return true;
    }

    // The callback for the HttpConnectionHelper that sets up the
    // connection properties for a new HttpConnection object: a GET
    // request carrying the fixed set of headers listed above.
    public void prepareRequest( String originalURL,
                                HttpConnection conn )
            throws IOException {
        conn.setRequestMethod( HttpConnection.GET );
        for( int i = 0; i < REQUEST_HEADERS.length; ++i ){
            conn.setRequestProperty( REQUEST_HEADERS[i][0],
                                     REQUEST_HEADERS[i][1] );
        }
    }
}
|
Note that PageLoader uses the HttpConnectionHelper class to automatically handle the HTTP redirects that major web sites often issue.
Finally, you define a MIDlet that uses all these classes. This particular MIDlet first prompts the user for a URL (defaulting to http://wireless.java.sun.com as an example), loads the page, then parses and displays all the <meta> tags it finds on the page:
import java.io.*;
import java.util.*;
import javax.microedition.lcdui.*;
import javax.microedition.midlet.*;
import webscraper.*;
// A MIDlet that uses the PageLoader and PageScraper
// classes to extract all the META tags from the
// start of an arbitrary page. The use is prompted
// for a URL, the page is loaded and parsed, and the
// tag values are displayed in a list.
public class ScraperTest extends MIDlet
implements CommandListener,
Runnable {
private Display display;
private PageLoader loader = new PageLoader();
private String url =
"http://wireless.java.sun.com";
public static final Command exitCommand =
new Command( "Exit",
Command.EXIT, 1 );
public static final Command okCommand =
new Command( "OK",
Command.OK, 1 );
public ScraperTest(){
}
// Process commands. If the OK command is invoked,
// need to determine which screen was active.
public void commandAction( Command c,
Displayable d ){
if( c == exitCommand ){
exitMIDlet();
} else if( c == okCommand &&
d instanceof Prompter ){
// Start the page loading....
url = ((Prompter) d).getString();
if( url.indexOf( "://" ) == -1 ){
url = "http://"+ url;
}
display.setCurrent( new Wait() );
} else if( c == okCommand &&
d instanceof Lister ){
display.setCurrent( new Prompter() );
}
}
protected void destroyApp( boolean unconditional )
throws MIDletStateChangeException {
exitMIDlet();
}
// Displays an error message using an alert.
private void error( String title, String msg,
Throwable e ){
Alert a = new Alert( title );
if( msg == null && e != null ){
msg = "Exception: ";
}
a.setString( msg +
( e != null ? e.toString() : "" ) );
a.setType( AlertType.ERROR );
display.setCurrent( a, new Prompter() );
}
public void exitMIDlet(){
notifyDestroyed();
}
public Display getDisplay(){ return display; }
// Start by prompting the user...
protected void initMIDlet(){
display.setCurrent( new Prompter() );
}
protected void pauseApp(){
}
// Does the actual parsing. Invoked indirectly by
// the wait screen using display.callSerially.
// Loads the page and then parses it for the META
// tags.
public void run(){
if( loader.loadPage( url ) ){
Wait w = (Wait) display.getCurrent();
w.update( "Parsing..." );
try {
PageScraper sc = new
PageScraper(
loader.getInputStream() );
Element e;
Lister list = new Lister();
while( ( e = sc.readElement() ) !=
null ){
if( !( e instanceof Tag ) )
continue;
Tag t = (Tag) e;
String name = t.getName();
if( t.isEndTag() ){
if( name.equals( "head" ) ){
break;
}
} else {
if( name.equals( "meta" ) ){
list.append( t.toString(),
null );
} else if( name.equals(
"body" ) ){
break;
}
}
}
loader.close();
display.setCurrent( list );
}
catch( Exception e ){
error( "Parsing error",
null, e );
}
} else {
Throwable e = loader.getException();
String msg = null;
if( e == null ){
msg = "HTTP status code " +
loader.getResponseCode();
}
loader.close();
error( "Cannot load page", msg, e );
}
}
protected void startApp()
throws MIDletStateChangeException {
if( display == null ){
display = Display.getDisplay( this );
initMIDlet();
}
}
// A simple class that prompts for a URL.
class Prompter extends TextBox {
Prompter(){
super( "Enter a URL:", url,
200, 0 );
addCommand( okCommand );
addCommand( exitCommand );
setCommandListener( ScraperTest.this );
}
}
// Displays our list of tags. The tags are
// added to the list in the run method.
class Lister extends List {
Lister(){
super( "Meta tags:", List.IMPLICIT );
addCommand( okCommand );
setCommandListener( ScraperTest.this );
}
}
// Asks the user to the wait while a lengthy
// operation takes place. The operation does not
// start until the canvas has been painted once,
// at which point display.callSerially is invoked
// to start it.
class Wait extends Canvas {
boolean called = false;
String msg = "Connecting...";
// Draw the message on screen.... fancier
// code would center and wrap the text...
protected void paint( Graphics g ){
g.setColor( 255, 255, 255 );
g.fillRect( 0, 0, getWidth(),
getHeight() );
g.setColor( 0, 0, 0 );
g.drawString( msg, 0, 0,
Graphics.TOP | Graphics.LEFT );
if( !called ){
display.callSerially( ScraperTest.this );
called = true;
}
}
// Update the string and immediately repaint
// the screen.
public void update( String newMsg ){
msg = newMsg;
repaint( 0, 0, getWidth(), getHeight() );
serviceRepaints();
}
}
|
This MIDlet also demonstrates the use of the Display.callSerially() method to ensure that an operation is performed after a Canvas has had a chance to paint itself.
It would be better practice, however, to do the page loading and parsing on a separate thread. Otherwise, the user interface will be unresponsive during the network communication.
About the Author: Eric Giguere is a software developer for iAnywhere Solutions, a subsidiary of Sybase, where he works on Java technologies for handheld and wireless computing. He holds BMath and MMath degrees in Computer Science from the University of Waterloo and has written extensively on computing topics.
Back To Top
|