#!/usr/bin/env python

"""
This script searches a directory tree for .html and .htm files and combines
their relative paths with a given URL prefix to build URLs. Each URL is then
passed to brokenlinks.py.

Author: Harald Schilly <harald.schilly@gmail.com>
Copyright: 2009, Vienna, Austria.

"""

#from glob import glob
import sys
import re
import os
import subprocess

def find_html_files(directory):
    """Return the .html/.htm files found below *directory*.

    Each entry is the file's path relative to *directory*, always written
    with '/' separators so it can be appended to a URL directly. The result
    is the same whether or not *directory* is given with a trailing path
    separator. Matching on the file extension is case-sensitive.
    """
    found = []
    for path, directories, files in os.walk(directory):
        # Sort in place so the traversal (and therefore the report) is
        # deterministic instead of depending on directory listing order.
        directories.sort()
        relative = os.path.relpath(path, directory)
        # relpath() yields '.' for the top-level directory itself.
        parts = [] if relative == os.curdir else relative.split(os.sep)
        for name in sorted(files):
            # A literal suffix test; the former regex '.html$|.htm$' used an
            # unescaped '.', so names such as 'fakexhtml' matched as well.
            if name.endswith(('.html', '.htm')):
                found.append('/'.join(parts + [name]))
    return found


def join_url(url_prefix, relative_path):
    """Append *relative_path* to *url_prefix* with exactly one '/' between.

    Works whether or not *url_prefix* already ends with a slash.
    """
    if not url_prefix.endswith('/'):
        url_prefix += '/'
    return url_prefix + relative_path.lstrip('/')


def main(argv):
    """Build the URL list and run brokenlinks.py once per URL.

    *argv* has the layout of sys.argv: program name, directory of .html
    files, URL prefix. Returns the process exit status: 1 when arguments
    are missing, otherwise 0.
    """
    if len(argv) <= 2:
        print('Usage:  %s ( directory of .html files ) ( URL prefix with trailing / )' % argv[0])
        return 1

    directory, url_prefix = argv[1], argv[2]

    tocheck = []
    for relative_path in find_html_files(directory):
        print('adding %-80s to the list of URLs' % relative_path)
        tocheck.append(join_url(url_prefix, relative_path))

    for url in tocheck:
        print('#' * 100)
        print('URL: %s' % url)
        # brokenlinks.py is resolved relative to the current working
        # directory; its exit status is intentionally not inspected.
        subprocess.call(['python', 'brokenlinks.py', url])
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))

