#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2006 Torbjörn Svensson <azoff@se.linux.org>.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA

import time;
import sys;
import re;


# Pick the date part of the log file name: the single command-line argument
# when exactly one is given, otherwise today's date as YYYY-MM-DD.
if len(sys.argv) != 2:
	log_date = time.strftime("%Y-%m-%d")
else:
	log_date = sys.argv[1]

# Resulting name: localhost_access_log.<date>.txt
filename = "localhost_access_log." + log_date + ".txt"

# Blacklisted client addresses: log rows whose first field is one of these
# are left out of the page statistics.
# Order and duplicates are kept as-is ('66.249.66.75' is listed twice).
ips = [
	'207.68.146.55',
	'207.68.146.62',
	'207.68.146.68',
	'207.68.188.244',
	'64.4.8.100',
	'64.4.8.118',
	'65.54.188.130',
	'65.54.188.131',
	'65.54.188.132',
	'65.54.188.133',
	'66.249.64.13',
	'66.249.64.15',
	'66.249.64.16',
	'66.249.64.27',
	'66.249.64.30',
	'66.249.64.33',
	'66.249.64.37',
	'66.249.64.38',
	'66.249.64.4',
	'66.249.64.42',
	'66.249.64.47',
	'66.249.64.50',
	'66.249.64.52',
	'66.249.64.58',
	'66.249.64.66',
	'66.249.64.68',
	'66.249.64.77',
	'66.249.65.144',
	'66.249.65.99',
	'66.249.66.75',
	'66.249.71.1',
	'66.249.71.13',
	'66.249.71.18',
	'66.249.71.28',
	'66.249.71.32',
	'66.249.71.33',
	'66.249.71.39',
	'66.249.71.40',
	'66.249.71.41',
	'66.249.71.44',
	'66.249.71.45',
	'66.249.71.50',
	'66.249.71.53',
	'66.249.71.54',
	'66.249.71.56',
	'66.249.71.57',
	'66.249.71.62',
	'66.249.71.67',
	'66.249.71.69',
	'66.249.71.72',
	'66.249.71.9',
	'64.62.168.76',
	'66.249.66.75',
	'72.30.102.83',
	'72.30.110.221',
	'72.30.111.6',
	'72.30.221.161',
	# NOTE(review): possibly a typo for '72.30.107.162' (neighbouring
	# entries are all 72.30.x.x) -- confirm before changing.
	'2.30.107.162',
	'72.30.128.158',
	'216.255.229.235',
	'70.42.51.10',
	'72.3.225.37',
	'66.147.154.3',
	'210.150.10.92',
	'81.93.165.197',
	'70.42.51.30',
	'38.100.225.7',
	'209.191.65.127',
	# The next three entries have only three octets; they look like
	# network prefixes rather than full addresses.
	'66.249.64',
	'66.249.71',
	'66.249.66',
	'212.214.136.130',
]

# Matches greedily up to the last '&page=' in the URI and captures
# 'page=<value>', where the value runs to the next '&' or the end of the string.
regexp = re.compile('^.*&(page=[^&]*)')

# Blacklist lookup tables, built once instead of scanning the list per row.
# Every entry is matched exactly.  Entries with fewer than four octets
# (e.g. '66.249.64') are additionally treated as network prefixes, because an
# exact comparison against a full client address could never match them.
_blocked_ips = set(ips)
_blocked_prefixes = tuple(
	ip.rstrip('.') + '.' for ip in ips if ip.count('.') < 3
)

# Maps 'page=<value>' to the number of hits counted for it.
data = {}

# open() works on both Python 2 and 3 (file() is Python 2 only); the with
# block makes sure the log is closed again.
with open(filename) as log:
	for row in log:
		# drop the trailing newline and surrounding whitespace
		row = row.strip()

		# cheap pre-filter: the row does not mention &page= at all
		if '&page=' not in row:
			continue

		rec = row.split(' ')

		# bot: the first field is the client address
		client = rec[0]
		if client in _blocked_ips or client.startswith(_blocked_prefixes):
			continue

		# Truncated rows have no URI field; skip them instead of
		# failing with an IndexError.
		if len(rec) < 7:
			continue

		# Work with the URI (field 6); percent-encoded underscores are
		# decoded in either hex case so both spellings count as one page.
		uri = rec[6].replace('%5f', '_').replace('%5F', '_')

		# '&page=' may occur elsewhere in the row (e.g. in a later field)
		# but not in the URI itself; such rows are skipped.
		match = regexp.search(uri)
		if match is None:
			continue
		work = match.group(1).replace('"', '')

		# increment the hit counter of this page, starting at 1
		data[work] = data.get(work, 0) + 1



# Turn {page: count} into (count, page) pairs so that a plain sort orders
# by hit count first and by page name second.
ranking = [(count, page) for page, count in data.items()]

# Highest count first.  reverse must be passed by keyword: Python 3 no longer
# accepts the positional (cmp, key, reverse) form.
ranking.sort(reverse=True)

# One line per page: right-aligned count, two spaces, then the page.
# The parenthesised single-argument form prints the same text on Python 2 and 3.
for count, page in ranking:
	print("%10d  %s" % (count, page))

sys.exit(0)

