<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">

<!-- www.matthewwest.co.uk                                              -->
<!-- Layout $LastChangedDate:: 2007-05-12 08:36:32 -0700 (Sat, 12 May#$ -->
<!-- Layout $Revision:: 134                                           $ -->
<!-- Copyright (C) 2005-2007 Matthew West                               -->
  
  <head profile="http://gmpg.org/xfn/11">
    <meta name="title" content="Blog - Link Checker" />
    <meta name="author" content="Matthew West" />
    <title>
      Blog - Link Checker
    </title>
        
    <link href="/stylesheets/site.css?1184434403" media="all" rel="Stylesheet" type="text/css" />
    
    <link href="/stylesheets/print.css?1184459558" media="print" rel="Stylesheet" type="text/css" />
    <link href="/stylesheets/syntax.css?1184434403" media="all" rel="Stylesheet" type="text/css" />
    <link href="/favicon.ico?1167527172" rel="shortcut icon" />
    <style type="text/css">
      
    </style>
          <link href="/blog/2002/8/1/4-photo-album" rel="prev" title="Blog - Photo Album" />
      <link href="/blog/2002/8/1/4-photo-album" rel="first" title="Blog - Photo Album" />
              <link href="/blog/2002/8/20/2-xml-to-xhtml-transformation" rel="next" title="Blog - XML to XHTML transformation" />
      <link href="/blog/2007/9/16/12-find-fixtures-without-thinking" rel="last" title="Blog - Find Fixtures Without Thinking" />
        <link href="http://www.matthewwest.co.uk/" rel="home" title="Index" />
    <link href="/blog" rel="up" title="Blog" />
    <link href="http://www.matthewwest.co.uk/blog/feeds" rel="alternate" title="ATOM" type="application/atom+xml" />
    <link href="http://www.matthewwest.co.uk/blog/feeds/rss2" rel="alternate" title="RSS" type="application/rss+xml" />
  </head>

  <body>
    <div id="bannerhead" >
      <span class="right">www.matthewwest.co.uk</span>
    </div>
     
    <div id="crumbs">
              <a href="/blog">Blog</a>
                  &#160;&gt;&#160;
                      <a href="/blog/2002/8/19/3-link-checker">Link Checker</a>
                        </div>
    <h1>
      Blog - Link Checker
    </h1>
        
      <div class="floatr">
    <a href="/blog/2002/8/20/2-xml-to-xhtml-transformation">XML to XHTML transformation&raquo;</a>
  </div>
  <div>
    <a href="/blog/2002/8/1/4-photo-album">&laquo;Photo Album</a>
  </div>
<div style="width:100%;clear:both;">
</div>

<div class="sidebarr tag-cloud">
  Tags:<br />
  <a href="http://www.matthewwest.co.uk/tags/Lucky+Soul" style="font-size: 58%;">Lucky Soul</a> <a href="http://www.matthewwest.co.uk/tags/avolites" style="font-size: 48%;">avolites</a> <a href="http://www.matthewwest.co.uk/tags/chamsys" style="font-size: 58%;">chamsys</a> <a href="http://www.matthewwest.co.uk/tags/computing" style="font-size: 77%;">computing</a> <a href="http://www.matthewwest.co.uk/tags/echelon" style="font-size: 82%;">echelon</a> <a href="http://www.matthewwest.co.uk/tags/hog+1000" style="font-size: 48%;">hog 1000</a> <a href="http://www.matthewwest.co.uk/tags/hog+2" style="font-size: 48%;">hog 2</a> <a href="http://www.matthewwest.co.uk/tags/hog+3" style="font-size: 110%;">hog 3</a> <a href="http://www.matthewwest.co.uk/tags/hog+iPc" style="font-size: 110%;">hog iPc</a> <a href="http://www.matthewwest.co.uk/tags/java" style="font-size: 58%;">java</a> <a href="http://www.matthewwest.co.uk/tags/lighting" style="font-size: 150%;">lighting</a> <a href="http://www.matthewwest.co.uk/tags/magicq" style="font-size: 58%;">magicq</a> <a href="http://www.matthewwest.co.uk/tags/music" style="font-size: 48%;">music</a> <a href="http://www.matthewwest.co.uk/tags/pearl+2004" style="font-size: 48%;">pearl 2004</a> <a href="http://www.matthewwest.co.uk/tags/pearl+expert" style="font-size: 48%;">pearl expert</a> <a href="http://www.matthewwest.co.uk/tags/rails" style="font-size: 48%;">rails</a> <a href="http://www.matthewwest.co.uk/tags/ruby" style="font-size: 48%;">ruby</a> <a href="http://www.matthewwest.co.uk/tags/sailing" style="font-size: 133%;">sailing</a> <a href="http://www.matthewwest.co.uk/tags/website" style="font-size: 77%;">website</a> <a href="http://www.matthewwest.co.uk/tags/xhtml" style="font-size: 72%;">xhtml</a> <a href="http://www.matthewwest.co.uk/tags/xml" style="font-size: 58%;">xml</a> <a href="http://www.matthewwest.co.uk/tags/xsl" style="font-size: 65%;">xsl</a> 
</div>

<dl class="sidebarr frame">
  <dt class="header">Recent Updates</dt>
      <dd><a href="http://www.matthewwest.co.uk/blog/2007/9/16/12-find-fixtures-without-thinking" rel="bookmark">Find Fixtures Without Thinking</a></dd>
      <dd><a href="http://www.matthewwest.co.uk/blog/2007/6/3/11-bush-hall-reviews" rel="bookmark">Bush Hall Reviews</a></dd>
      <dd><a href="http://www.matthewwest.co.uk/blog/2007/5/25/10-lucky-soul-at-bush-hall" rel="bookmark">Lucky Soul at Bush Hall</a></dd>
      <dd><a href="http://www.matthewwest.co.uk/blog/2005/2/21/9-xsl-contents" rel="bookmark">XSL Contents</a></dd>
      <dd><a href="http://www.matthewwest.co.uk/blog/2007/4/15/8-new-website" rel="bookmark">New Website</a></dd>
  </dl>

<div class="post hentry">
  <h2 class="entry-title"><a href="http://www.matthewwest.co.uk/blog/2002/8/19/3-link-checker" rel="bookmark">Link Checker</a></h2>
  <h4 class="tags floatr">
    Tags: <a href="http://www.matthewwest.co.uk/tags/computing" rel="tag">computing</a>, <a href="http://www.matthewwest.co.uk/tags/website" rel="tag">website</a>, <a href="http://www.matthewwest.co.uk/tags/java" rel="tag">java</a>, <a href="http://www.matthewwest.co.uk/tags/xhtml" rel="tag">xhtml</a>
  </h4>
  <h4 class="datetime">
          Posted
      <abbr class="published" title="2002-08-19T23:38:00Z">
        at August 19, 2002 23:38
      </abbr>
      | Updated
      <abbr class="modified" title="2007-04-13T22:03:05Z">
        at April 13, 2007 22:03
      </abbr>
      </h4>
  <div class="entry-content">
    
    <p><code>LinkChecker</code> is a program that checks web page links to make sure that they work.  When writing web pages it is easy to make a small typing error that results in a link that doesn&#8217;t work.  Checking all the links on a web site is time consuming, and it is hard to be sure that they have all been checked.  This program checks each link on each page one by one, to make sure that they all work.</p>


	<h3>Usage</h3>


	<p>The program can be run from the command line like this:</p>


	<pre><code>java -jar /path/to/LinkChecker.jar http://www.mysite.com/index.html -d2000</code></pre>


	<p>This will load the index page of <code>www.mysite.com</code> and will check that every link works.  Each page that it finds a link to that is also on the same server will be checked in turn.  Thus the whole site should eventually be checked, as long as there are links between all the pages.  The <code>-d</code> option specifies a delay time in milliseconds.  <code>-d2000</code> specifies a delay of 2 seconds between each page access.  This is to avoid overloading the server.  Each valid address is cached, so that it only needs to be looked up once.  If the computer is hosting a copy of a website, and is not connected to the Internet then the <code>-l</code> option will cause it to only check local addresses.</p>


	<p>The program can only check web pages that are well-formed, which means that they comply with the w3c <span class="caps">XML</span> specification.  This probably means that it is limited to <span class="caps">XHTML</span> web pages.  The <span class="caps">DTD</span>&#8217;s for <span class="caps">XHTML</span> are included in the .jar file.  If the web pages use different <span class="caps">DTD</span>&#8217;s then the program will only work if the computer is connected to the Internet.</p>


	<p>The program can also be called directly from an Ant build script.  First of all the task must be defined:</p>


	<pre><code>&lt;taskdef name="checklinks" 
           classname="uk.co.matthewwest.LinkChecker.LinkCheckerTask" 
       classpath="/path/to/LinkChecker.jar" /&gt;</code></pre>


	<p>Then the task can be called like any other Ant task:</p>


	<pre><code>&lt;checklinks href="http://www.mysite.com/index.html" 
            delay="150" local="no" /&gt;</code></pre>


	<h3>Version 0.2</h3>


	<p>Version 0.2 corrects a couple of problems caused by upgrading of other parts.  Apache now gives longer content headers which include the charset while version 0.1 was looking for just &#8220;text/html&#8221;.  Modern versions of Java include an <span class="caps">XML</span> parser which gives a <span class="caps">SAX</span> warning seemingly every time it comes across an entity, so version 0.2 suppresses warnings unless &#8216;debug&#8217; is set to true.</p>


	<h3>Libraries</h3>


	<p>Version 0.1 of the program is supplied with <a href="http://xml.apache.org/xerces2-j/index.html">Xerces</a>, an <span class="caps">XML</span> parser developed by the Apache Foundation.  Version 0.2 assumes a more modern version of Java, which comes with an <span class="caps">XML</span> library built in.  <a href="http://jakarta.apache.org/ant/">Ant</a> is used to build the project.  There are also some <span class="caps">DTD</span>&#8217;s included which were downloaded from the <a href="http://www.w3.org/">w3c</a> website.</p>


	<h3>Download</h3>


	<p>The program is available for download as a <a href="/docs/LinkChecker-0.2.zip">zip file</a>, or as a <a href="/docs/LinkChecker-0.2.tar.gz">tar file</a>.  My code is released under the terms of the <span class="caps">GPL</span>.</p>


	<h3>How It Works</h3>


	<p>Each page is parsed by a <span class="caps">SAX</span> parser.  Every time the parser encounters a link (<code>&lt;a&gt;, &lt;link&gt;, &lt;img&gt;, &lt;form&gt;</code>) it calls <code>checkUrl()</code>.  This checks a <code>HashSet</code> of previously validated addresses, and if the <span class="caps">URL</span> is not among them it attempts to connect to a server to retrieve it.  If it can, then the <span class="caps">URL</span> is added to the <code>HashSet</code>.  If the <span class="caps">URL</span> is on the same server as the first page and hasn&#8217;t been checked it is added to a <code>LinkedList</code> of pages pending checking.  If it cannot retrieve the page then it prints a message.  It then moves on to the next page in the list.</p>
      </div>
</div>


    <div id="footer">
             <span class="left">
          Last modified : 13 Apr 2007
        </span>
        
      <span class="right vcard">
       <a href="http://www.matthewwest.co.uk/website-information">Copyright &#169; 2002-2007</a>
       <a href="/email/send_email" class="fn" id="mehcardname" title="E-mail me">Matthew West</a>
      </span>
    </div>
    
    
<!-- Menu                                                            -->
<!-- $Rev:: 10#$ $Date:: 2006-12-11 03:28:11 +0000 (Mon, 11 Dec 20#$ -->  
    <div id="menu">
    <ul class="main">
      <li class="page vcard">
        <a href="#mehcardname" class="include"></a>
        <a href="http://www.matthewwest.co.uk/" class="url" rel="me">Index</a>
      </li>
      <li class="menu"><a href="/professional-experience">Professional</a>  <ul class="sub"><li class="page"><a href="/docs/cv.pdf">Resumé</a></li><li class="page"><a href="/professional-experience/earlier-lighting-work">Earlier Work</a></li><li class="page"><a href="/professional-experience/lighting-photos">Portfolio</a></li>  </ul></li><li class="menu"><a href="/lightingdb">Lighting Database</a>  <ul class="sub"><li class="menu"><a href="/lightingdb/showmanufacturer/Clay Paky">Clay Paky</a>  <ul class="sub"><li class="section"><a href="/lightingdb/showlantern/Clay Paky/Golden Scan HPE">Golden Scan HPE</a></li><li class="section"><a href="/lightingdb/showlantern/Clay Paky/MiniScan">MiniScan</a></li>  </ul></li><li class="menu"><a href="/lightingdb/showmanufacturer/High End">High End</a>  <ul class="sub"><li class="section"><a href="/lightingdb/showlantern/High End/Studio Spot 575">Studio Spot 575</a></li>  </ul></li><li class="menu"><a href="/lightingdb/showmanufacturer/Martin">Martin</a>  <ul class="sub"><li class="section"><a href="/lightingdb/showlantern/Martin/Mac 600E">Mac 600E</a></li><li class="section"><a href="/lightingdb/showlantern/Martin/Mac 2000 Wash">Mac 2000 Wash</a></li><li class="section"><a href="/lightingdb/showlantern/Martin/Mac 500">Mac 500</a></li><li class="section"><a href="/lightingdb/showlantern/Martin/Mac 250">Mac 250</a></li><li class="section"><a href="/lightingdb/showlantern/Martin/Mac 2000 Profile">Mac 2000 Profile</a></li><li class="section"><a href="/lightingdb/showlantern/Martin/Mac 500E">Mac 500E</a></li><li class="section"><a href="/lightingdb/showlantern/Martin/Mac 550">Mac 550</a></li><li class="page">...</li>  </ul></li>  </ul></li><li class="menu"><a href="/photos">Photos</a>  <ul class="sub"><li class="page"><a href="/photos/all">See them all</a></li><li class="menu"><a href="/photos/holidays">Holidays</a>  <ul class="sub"><li class="page"><a href="/photos/holidays/lofoten-islands">Lofoten</a></li><li class="section"><a 
href="/photos/holidays/eastern-europe">Eastern Europe</a></li><li class="page"><a href="/photos/holidays/cornwall-break">Cornwall Break</a></li><li class="page"><a href="/photos/holidays/cornish-sailing">Cornish Sailing</a></li>  </ul></li><li class="menu"><a href="/photos/lighting-photos">Lighting Photos</a>  <ul class="sub"><li class="page"><a href="/photos/lighting-photos/hampton-court">Hampton Court</a></li><li class="page"><a href="/photos/lighting-photos/dreamgirls">Dreamgirls</a></li>  </ul></li><li class="menu"><a href="/photos/misc">Other photos</a>  <ul class="sub"><li class="page"><a href="/photos/misc/bristol">Bristol</a></li><li class="page"><a href="/photos/misc/portsmouth">Portsmouth</a></li><li class="page"><a href="/photos/misc/oakley">Oakley</a></li><li class="page"><a href="/photos/misc/kingston">Kingston</a></li><li class="page"><a href="/photos/misc/cats-in-portugal">Cats</a></li>  </ul></li>  </ul></li><li class="page"><a href="/blog">Blog</a></li><li class="page"><a href="/links">Links</a></li><li class="page"><a href="/site-info">Site Info</a></li><li class="page"><a href="/email">Contact Me</a></li>
      <li class="google">
        <form action="http://www.google.com/custom" method="get">
     	  <div>
	        <a href="http://www.google.com/search">
  	          <img alt="Google" width="128" height="53" src="/images/google40.png" />
	        </a>
	        <input class="text" value="" maxlength="255" size="12" name="q" type="text" /><br />
	        <div>
	          <input value="" name="sitesearch" type="radio" />
	          All WWW<br />
	          <input checked="checked" value="www.matthewwest.co.uk" name="sitesearch" type="radio" />
	          This site only<br />
	        </div>
	        <input type="hidden" name="cof"
		           value="S:http://www.matthewwest.co.uk;GL:0;VLC:#3333ff;AH:left;LH:80;LC:#3333ff;L:http://www.matthewwest.co.uk/images/h80google.png;ALC:#3333ff;BIMG:http://www.matthewwest.co.uk/images/beams.png;LW:1165;AWFID:ab0c1f8918996ca6;"
		           />
	        <input value="www.matthewwest.co.uk" name="domains" type="hidden" />
	        <input value="Search" name="sa" type="submit" />
	      </div>
        </form>
      </li>
    </ul>
        <script type="text/javascript" src="http://embed.technorati.com/embed/hi6ybdyt32.js"> </script>
  </div>

  </body>
</html>