/*
 * Problem
 *   You need to extract all the HTML tags from a URL.
 *
 * Solution
 *   Use this simple HTML tag extractor.
 *
 * Discussion
 *   A simple HTML extractor can be made by reading a character at a time and
 *   looking for < and > tag delimiters. This is reasonably efficient if a
 *   BufferedReader is used. The ReadTag program shown in Example 18-7
 *   implements this; given a URL, it opens the file (similar to TextBrowser in
 *   Recipe 18.7) and extracts the HTML tags. Each tag is printed to the
 *   standard output.
 *
 * Example 18-7. ReadTag.java
 */
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;

/**
 * A simple but reusable HTML tag extractor.
 *
 * <p>The input is scanned one character at a time; everything from a
 * {@code '<'} up to and including the next {@code '>'} is reported as a tag.
 * No further HTML parsing is attempted, so a {@code '>'} inside a comment or
 * an attribute value ends the tag early.
 */
public class ReadTag {
    /** The URL that this ReadTag object is reading */
    protected URL myURL = null;
    /** The Reader for this object */
    protected BufferedReader inrdr = null;

    /**
     * Simple main showing one way of using the ReadTag class: prints every
     * tag found at each URL named on the command line, one tag per line.
     *
     * @param args one or more URL strings
     * @throws MalformedURLException if an argument is not a valid URL
     * @throws IOException if a URL cannot be opened or read
     */
    public static void main(String[] args)
            throws MalformedURLException, IOException {
        if (args.length == 0) {
            System.err.println("Usage: ReadTag URL [...]");
            return;
        }

        for (int i = 0; i < args.length; i++) {
            // Use args[i], not args[0], so that every URL given is processed.
            ReadTag rt = new ReadTag(args[i]);
            try {
                String tag;
                while ((tag = rt.nextTag()) != null) {
                    System.out.println(tag);
                }
            } finally {
                // Release the stream even if reading fails part-way through.
                rt.close();
            }
        }
    }

    /**
     * Construct a ReadTag given a URL String.
     *
     * @param theURLString the URL to read, in string form
     * @throws MalformedURLException if the string is not a valid URL
     * @throws IOException if the URL cannot be opened
     */
    public ReadTag(String theURLString)
            throws IOException, MalformedURLException {
        this(new URL(theURLString));
    }

    /**
     * Construct a ReadTag given a URL.
     *
     * @param theURL the URL to read
     * @throws IOException if the URL cannot be opened
     */
    public ReadTag(URL theURL) throws IOException {
        myURL = theURL;
        // Open the URL for reading. The bytes are decoded with the platform
        // default charset, since none is passed to InputStreamReader.
        inrdr = new BufferedReader(new InputStreamReader(myURL.openStream()));
    }

    /**
     * Read the next tag, skipping any text that precedes it.
     *
     * @return the next tag, or {@code null} if the end of the input is
     *         reached before another {@code '<'} is seen
     * @throws IOException if reading fails
     */
    public String nextTag() throws IOException {
        int i;
        while ((i = inrdr.read()) != -1) {
            if ((char) i == '<') {
                return readTag();
            }
        }
        return null;
    }

    /**
     * Close the underlying reader.
     *
     * @throws IOException if closing fails
     */
    public void close() throws IOException {
        inrdr.close();
    }

    /**
     * Read one tag. Called after the opening {@code '<'} has already been
     * consumed. Adapted from code by Elliotte Rusty Harold.
     *
     * @return the tag text starting with {@code '<'}; if the input ends
     *         before a {@code '>'} is found, the unterminated text read so
     *         far is returned
     * @throws IOException if reading fails
     */
    protected String readTag() throws IOException {
        StringBuffer theTag = new StringBuffer("<");
        int i;
        while ((i = inrdr.read()) != -1) {
            theTag.append((char) i);
            if (i == '>') {
                break; // the closing delimiter is part of the tag
            }
        }
        return theTag.toString();
    }

    /* Return a String representation of this object */
    public String toString() {
        return "ReadTag[" + myURL.toString() + "]";
    }
}

/*
 * When I ran it on one system (apparently part-way through converting to
 * modern lowercase HTML tags), I got the following output:
 *
 * darian$ java ReadTag http://localhost/
 * <html>
 * <head>
 * <title>
 * </title>
 * </head>
 * <FRAMESET BORDER="0" ROWS="110, *" FRAMESPACING="0">
 * <FRAME NAME="header" src="/books/2/213/1/html/2/header.html" SCROLLING="NO" MARGINHEIGHT="0" FRAMEBORDER="0">
 * <FRAMESET COLS="130, *" FRAMESPACING="0">
 * <FRAME NAME="menu" src="/books/2/213/1/html/2/menu.html" SCROLLING="NO" MARGINHEIGHT="0" FRAMEBORDER="0">
 * <FRAME NAME="main" src="/books/2/213/1/html/2/main.html" MARGINHEIGHT="15" MARGINWIDTH="15" FRAMEBORDER="0">
 * </FRAMESET>
 * </FRAMESET>
 * </html>
 * darian$
 */