#!/usr/bin/python

import os
import re
import sys
import tempfile
import time

import Debug

# A Python version of the Perl script.  It should be a lot cleaner.

# Verbosity value handed to Debug.Debug below.
debugLevel = 1
# When zero, Site.appendData deletes each downloaded temp file after use;
# set it to a non-zero value to keep the files for inspection.
saveTempfiles = 0


# Shared debug printer; the rest of this file calls d.dprint(level, message).
d = Debug.Debug(debugLevel)
class Site:
	"""One news site whose page is fetched with wget and embedded in the digest.

	Attributes set by __init__:
	  siteName    -- display name used in the section heading.
	  siteURL     -- URL handed to wget and linked from the heading.
	  siteBaseURL -- scheme and host of siteURL; prefixed onto root-relative links.
	  tempFile    -- path of the scratch file wget downloads into.
	  wgetCommand -- shell command run by appendData (subclasses may replace it).
	"""

	# The patterns never change, so compile them once for the class instead
	# of on every appendData call.
	# Scheme plus host; accepts https as well as http so that an https site
	# does not end up with its full URL as the "base".
	_matchBase = re.compile(r"""(https?://[^/]*).*$""")
	# Protocol-relative reference right after an opening double quote.
	_doubleSlash = re.compile(r""""//""")
	# Root-relative value after '=' with an optional double / single quote.
	_generalSlash = re.compile(r"""(=\"?)/""")
	_generalSlash2 = re.compile(r"""(='?)/""")
	_form = re.compile(r"""(<form.*?action=)(/)(.*?>)""", re.IGNORECASE)
	_refresh = re.compile(r"""^(.*)<meta.*?http-equiv="refresh".*?>(.*)$""", re.IGNORECASE)
	_img = re.compile(r"""(<img src=)\s*(/\w+)""", re.IGNORECASE)

	def __init__(self, siteName, siteURL):
		"""Record the site's name and URL and prepare its temp file and wget command."""
		self.siteName = siteName
		self.siteURL = siteURL
		self.siteBaseURL = self._matchBase.sub(r'\1', self.siteURL)
		d.dprint(2, "matchBase for %s is %s." % (self.siteURL, self.siteBaseURL))
		# mkstemp replaces the external `tempfile` command, which is not
		# available on every system and yields an empty path when missing.
		handle, self.tempFile = tempfile.mkstemp()
		os.close(handle)
		d.dprint(3, "tempFile for %s is %s." % (self.siteName, self.tempFile))
		# NOTE: the command is run through the shell with the URL unquoted,
		# so URLs containing shell metacharacters (e.g. '&') will not work.
		self.wgetCommand = "wget -t 3 %s -O %s > /dev/tty" % (self.siteURL, self.tempFile)
		d.dprint(2, "wgetCommand for %s is %s." % (self.siteName, self.wgetCommand))

	def test(self):
		"""Print the site's name and URL."""
		# Single-argument parenthesized form behaves the same on Python 2 and 3.
		print("Site is %s, URL is %s" % (self.siteName, self.siteURL))

	def _rewriteLine(self, line):
		"""Return line with root-relative links made absolute and meta refresh tags removed."""
		base = self.siteBaseURL

		# Function replacements insert the base URL literally, so any
		# backslash in it is never interpreted as a regex template escape.
		def absolutize(match):
			return match.group(1) + base + '/'

		# Must run first: turns "//host into "http://host so that the
		# root-relative rules below do not treat it as a local path.
		line = self._doubleSlash.sub('"http://', line)
		line = self._generalSlash.sub(absolutize, line)
		line = self._generalSlash2.sub(absolutize, line)
		line = self._form.sub(lambda m: m.group(1) + base + '/' + m.group(3), line)
		# Drop the meta refresh tag, keeping the text on either side of it.
		line = self._refresh.sub(lambda m: m.group(1) + m.group(2), line)
		line = self._img.sub(lambda m: m.group(1) + base + m.group(2), line)
		return line

	def _removeTempFile(self):
		"""Delete the temp file; like `rm -f`, a failure to delete is not fatal."""
		try:
			os.remove(self.tempFile)
		except OSError:
			pass

	def appendData(self, fileToWrite):
		"""Download the site and append its rewritten page to fileToWrite.

		Writes an <h2> heading linking to the site, runs self.wgetCommand,
		then copies the downloaded file line by line through _rewriteLine.
		The temp file is deleted afterwards unless saveTempfiles is set.
		"""
		fileToWrite.write("<h2><a href=\"%s\">%s</a>:</h2>\n" % (self.siteURL, self.siteName))

		os.system(self.wgetCommand)
		try:
			tempRealFile = open(self.tempFile, 'r')
			try:
				for line in tempRealFile:
					fileToWrite.write(self._rewriteLine(line))
			finally:
				tempRealFile.close()
		finally:
			# Clean up even when reading or writing failed part-way.
			if (not saveTempfiles):
				self._removeTempFile()


class Cookiefile(Site):
	"""A Site that is downloaded with a wget cookie file loaded."""

	def __init__(self, siteName, siteURL, cookieFile):
		"""Set up as a plain Site, then swap in a wget command that loads cookieFile."""
		# The base initializer provides siteURL and tempFile used below.
		Site.__init__(self, siteName, siteURL)
		# Replace the plain download command built by Site.__init__.
		commandParts = (cookieFile, self.siteURL, self.tempFile)
		self.wgetCommand = "wget -t 3 --cookies=on --load-cookies %s  %s -O %s > /dev/tty" % commandParts
		logMessage = "REAL wgetCommand for %s is %s." % (self.siteName, self.wgetCommand)
		d.dprint(2, logMessage)


# Sites to fetch; main() appends them to the output page in this order.
# Cookiefile entries are downloaded with the given wget cookie file.
listOfSites = [
	Cookiefile("Slashdot", "http://slashdot.org", "~/.w3m/slashdotcookie"),
	Site("Google News", "http://news.google.com"),
	Site("CNN", "http://www.cnn.com"),
	Site("Houston Chronicle", "http://www.chron.com"),
	Site("Austin American-Statesman", "http://www.statesman.com"),
	Cookiefile("kuro5hin", "http://www.kuro5hin.org", "~/.w3m/kuro5hincookie"),
	Site("Christian Science Monitor", "http://www.csmonitor.com"),
	Site("Newsforge", "http://newsforge.com"),
	Site("ESPN", "http://espn.go.com/main.html"),
	Site("The Register", "http://www.theregister.co.uk"),
	Site("geekaustin", "http://www.geekaustin.org"),
	Site("Freshmeat", "http://freshmeat.net"),
]

def doHeader(fileToWrite):
	"""Write the opening HTML tags and a "Last updated" timestamp to fileToWrite."""
	stamp = time.ctime(time.time())
	# Two separate writes: the opening tags, then the timestamp line
	# (which is deliberately left without a trailing newline).
	for chunk in ("<html><body>\n", "Last updated: %s" % stamp):
		fileToWrite.write(chunk)
	

def main():
	"""Build the combined news page.

	The output path is the first command-line argument, or
	'testgetnews.html' when none is given.  The header is written first,
	then every entry of listOfSites appends its page, and finally the
	HTML document opened by doHeader is closed.
	"""
	if (d.level >= 4):
		for site in listOfSites:
			site.test()
	if sys.argv[1:]:
		fileToWrite = sys.argv[1]
	else:
		d.dprint(4,"len is 0")
		fileToWrite = 'testgetnews.html'
	d.dprint(3,"fileToWrite is %s" % fileToWrite)
	realFile = open(fileToWrite, 'w')
	try:
		doHeader(realFile)
		for site in listOfSites:
			site.appendData(realFile)
		# doHeader opens <html><body>; close them so the page is well formed.
		realFile.write("</body></html>\n")
	finally:
		# Close the output even if a site fails part-way through.
		realFile.close()


# Generate the page only when run as a script, not when imported.
if __name__ == '__main__':
	main()
