#!/usr/bin/env python
"""
This script scans a given website for broken links.
It extracts all href="(..)" strings and tests whether the response from the
webserver is 200 (HTTP OK). If not, it shows you the broken links.
Requests are parallelized for fast testing.

Author: Harald Schilly <harald.schilly@gmail.com>
Copyright: 2009, Vienna, Austria
License: BSD
"""
import urllib
import socket
import os, sys
import re
import threading
# Apply a global 10 second default timeout to newly created sockets, so a
# single unresponsive server cannot block a check indefinitely.
socket.setdefaulttimeout(10) #never wait longer than 10 secs


def testlink(what, BASE):   
    """
    called on each given URL
    """
    links = []
    for n, l in enumerate(page.split('\n')):
        # n is line number
        for ll in re.findall(r'''href=['"](.*?)['"]''', l, re.I):
           links.append((n, ll))
    
    status = []
    def task(idx, link, status):
         """
         the actual task of testing a website 
         """
         s = None
         try:
              opener = urllib.FancyURLopener({})
              u = opener.open(link[1])
              s = u.getcode() is 200
         except IOError:
              # socket timeout
              s = False
         if s is None or s is False:
             status.append((link, s))
         sys.stdout.write('[%-2s] %-80s %5.0f' % (idx, link[:75], s) + ' '*10 + '\r')
          
    
    # parallelization. safe, since most of the time we
    # are waiting for the network to respond
    print 'Testing ...', 
    tasks = []
    for idx, link in enumerate(links):
         if link[1].startswith('mailto'):
            continue
         if not link[1].startswith('http') and not link[1].startswith('ftp:'):
            link = (link[0], BASE + '/' + link[1])
         t = threading.Thread(target=task, args=(idx,link,status))
         t.start()
         tasks.append(t)
    
    # synchronization point
    for t in tasks:
         t.join(timeout=30)
   
    # output
    
    out = sorted([ l for (l,s) in status if s is False ])
    if len(out) is 0: 
       print 'all OK!', ' '*90
       sys.exit(0)
    print 'These links are broken or website says they are not ok for some reason.'
    print '\n'.join([ '%5s - %s' % o for o in out])


if __name__ == '__main__':
    # Command-line entry point: every argument is the URL of an HTML page
    # whose links are checked in turn.
    if len(sys.argv) <= 1:
        print('Usage:  %s URL... list to check for broken links' % sys.argv[0])
        print('       URLs must point to a certain html page')
        sys.exit(1)

    for item in sys.argv[1:]:
        if not (item.startswith('http:') or item.startswith('https:')):
            print("ERROR: an URL must start with http and point to a html page")
            sys.exit(1)

        source = urllib.urlopen(item)
        try:
            page = source.read()
        finally:
            source.close()

        # BASE is everything before the last '/' of the path.  The slashes of
        # the '://' scheme separator must not count, otherwise a URL without
        # a path such as 'http://example.com' would yield 'http:/'.
        scheme_sep = item.find('://')
        path_start = scheme_sep + 3 if scheme_sep != -1 else 0
        last_slash = item.rfind('/')
        if last_slash >= path_start:
            BASE = item[:last_slash]
        else:
            BASE = item

        testlink(page, BASE)
