核心的正则模块是直接从apachelog模块里拿出来的,我自己几乎什么都没做
apachelog模块地址:http://pypi.python.org/pypi/apachelog/1.0
- #!/usr/bin/env python
- #coding=gbk
- """
- Apache Log Parser,
- see module apahcelog:
- http://pypi.python.org/pypi/apachelog/1.0
- """
- import re
- import sys
- import ConfigParser
def getFormat():
    """Return the log-format string selected in ``format.ini``.

    ``format.ini`` is read from the current working directory.  Its
    ``[type]`` section names the active format through the ``name`` option,
    and its ``[format]`` section maps format names to Apache ``LogFormat``
    strings.

    Returns:
        The format string stored in ``[format]`` under the selected name.

    On any configuration problem (file cannot be loaded, missing section or
    option, undefined format name) a message is printed and the process
    exits with status 1 through ``sys.exit``.
    """
    # Imported locally so the function runs on both Python 2 and Python 3,
    # where the module was renamed.
    try:
        import configparser
    except ImportError:
        import ConfigParser as configparser

    # RawConfigParser: format strings are full of literal '%' characters
    # that must never be treated as interpolation markers.
    config = configparser.RawConfigParser()
    try:
        # read() opens and closes the file itself and returns the list of
        # files it managed to load; an empty list means it was unreadable.
        loaded = config.read('format.ini')
    except configparser.Error as err:
        print("Bad format.ini: %s" % err)
        sys.exit(1)
    if not loaded:
        print("Can not load format.ini !")
        sys.exit(1)

    try:
        format_name = config.get('type', 'name')
    except (configparser.NoSectionError, configparser.NoOptionError):
        print("Bad section/option name! Please check format.ini")
        sys.exit(1)

    try:
        # Looking the name up through the parser applies the same key
        # normalisation that was used when the file was loaded.
        return config.get('format', format_name)
    except configparser.NoSectionError:
        print("Can not enum option [format]")
        sys.exit(1)
    except configparser.NoOptionError:
        print("Format \"%s\" has not been defined in format.ini" % format_name)
        sys.exit(1)
# Excerpted from the apachelog module.
class ApacheLogParserError(Exception):
    """Raised when a format cannot be compiled or a log line does not match."""


class parser(object):
    """Parse access-log lines according to an Apache ``LogFormat`` string.

    The format is split on whitespace into directives (``%h``, ``%t``,
    ``\\"%r\\"`` ...).  Each directive contributes one capturing group to a
    single regular expression, and one field name (see ``alias``) to the
    dictionaries returned by ``parse``.
    """

    # A directive wrapped in escaped quotes, e.g. \"%r\" : the quotes are
    # written as backslash + double quote inside the format string.
    _LEADING_QUOTE = re.compile(r'^\\"')
    _TRAILING_QUOTE = re.compile(r'\\"$')
    _REFERER_OR_AGENT = re.compile('Referer|User-Agent')
    # Any directive that starts with '%' and ends in 't' (e.g. %t).
    _TIME_DIRECTIVE = re.compile('^%.*t$')

    def __init__(self, format):
        """Build the parser for ``format`` (an Apache LogFormat string).

        Raises:
            ApacheLogParserError: if the generated regex cannot be compiled.
        """
        self._names = []
        self._regex = None
        self._pattern = ''
        self._parse_format(format)

    def _parse_format(self, format):
        """Translate ``format`` into ``self._pattern`` / ``self._regex``.

        Also fills ``self._names`` with one (aliased) field name per
        directive, in the order of the regex groups.

        Raises:
            ApacheLogParserError: if the generated regex cannot be compiled.
        """
        # Collapse runs of spaces/tabs so that splitting on a single space
        # yields exactly one element per directive.
        format = re.sub('[ \t]+', ' ', format.strip())

        names = []
        subpatterns = []
        for element in format.split(' '):
            hasquotes = bool(self._LEADING_QUOTE.search(element))
            if hasquotes:
                element = self._LEADING_QUOTE.sub('', element)
                element = self._TRAILING_QUOTE.sub('', element)

            names.append(self.alias(element))

            if hasquotes:
                if element == '%r' or self._REFERER_OR_AGENT.search(element):
                    # Quoted field that may itself contain backslash-escaped
                    # characters, including escaped double quotes.
                    subpattern = r'\"([^"\\]*(?:\\.[^"\\]*)*)\"'
                else:
                    subpattern = r'\"([^\"]*)\"'
            elif self._TIME_DIRECTIVE.search(element):
                # Bracketed value such as [03/Jun/2009:10:53:19 +0800].
                subpattern = r'(\[[^\]]+\])'
            elif element == '%U':
                subpattern = '(.+?)'
            else:
                subpattern = r'(\S*)'
            subpatterns.append(subpattern)

        # Replace rather than extend, so names always line up with the
        # groups of the pattern built in this call.
        self._names = names
        self._pattern = '^' + ' '.join(subpatterns) + '$'
        try:
            self._regex = re.compile(self._pattern)
        except re.error as e:
            raise ApacheLogParserError(e)

    def parse(self, line):
        """Return a dict mapping field names to the values found in ``line``.

        Leading and trailing whitespace of ``line`` is ignored.

        Raises:
            ApacheLogParserError: if ``line`` does not match the format.
        """
        line = line.strip()
        match = self._regex.match(line)
        if match is None:
            raise ApacheLogParserError("Unable to parse: %s" % line)
        return dict(zip(self._names, match.groups()))

    def alias(self, name):
        """Return the dictionary key to use for directive ``name``.

        The default is the directive itself; subclasses may override this
        to provide friendlier field names.
        """
        return name
上面是我修改后的模块内容,去除了一些用不到的东西,加了个配置文件,保存为AccessParse.py(下面的脚本通过 import AccessParse 引用它)。下面是调用它的脚本LogParser.py:
- #!/usr/bin/env python
- # coding=gbk
- # author : Python[AT]Live.it
- import re
- import sys
- import AccessParse
# Maps Apache LogFormat directives to the field names used in the output.
LogFormat = {
    '%h': 'Remote-IP',
    '%l': 'Login',
    '%u': 'User',
    '%t': 'AccessTime',
    '%r': 'Request',
    '%>s': 'Status',
    '%b': 'Bytes',
    '%{Referer}i': 'Referer',
    '%{User-Agent}i': 'User-Agent',
}


class myParser(AccessParse.parser):
    """Log parser whose result keys are the readable names in LogFormat."""

    def alias(self, name):
        """Return the readable name for directive ``name``.

        Directives missing from ``LogFormat`` keep their raw form, so a
        format using them can still be parsed.
        """
        return LogFormat.get(name, name)


def main():
    """Parse the access log named on the command line and print each entry.

    Prints a usage message and exits with status 0 when no log file is
    given.  Stops with exit status 1 at the first line that does not match
    the format selected in format.ini.
    """
    try:
        LogFile = sys.argv[1]
    except IndexError:
        print("Usage : Python %s access.log" % sys.argv[0])
        sys.exit(0)

    p = myParser(AccessParse.getFormat())
    # The with-block closes the log file even when parsing aborts early.
    with open(LogFile) as log:
        for line in log:
            try:
                data = p.parse(line)
            except AccessParse.ApacheLogParserError as e:
                print("Parser Error , %s" % e)
                print("\nMaybe got a bad format in format.ini")
                sys.exit(1)
            print(data)


if __name__ == '__main__':
    main()
然后定义一个format.ini
[type]
name = combined
[format]
common = %h %l %u %t \"%r\" %>s %b
combined = %h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"
这里指明要分析combined格式的日志。如果需要支持其他格式,在ini的[format]里继续添加即可;不过加的内容多的话,恐怕LogFormat字典还要改一下,那个字典不全,或是直接不用它也好。目前是把解析出的字典内容直接打印出来,可以根据需要再匹配一下。
E:\ApacheLogParser>python LogParser.py access.log
{'Status': '301', 'Remote-IP': '127.0.0.1', 'Bytes': '235', 'AccessTime': '[03/Jun/2009:10:53:19 +0800]', 'Request': 'POST /wordpress HTTP/1.1', 'User-Agent': "PySpider/1.0 (Python's Spider | Python[AT]Live.it)", 'Referer': '-', 'User': '-', 'Login': '-'}
最后,关于apache日志的一些正则,最原汁的在这里:
http://cpan.uwinnipeg.ca/htdocs/Apache-LogRegex/Apache/LogRegex.html
apachelog模块的作者也是参照这个写的
记录一下几个模块
apachelog (python)
Apache::LogRegex (perl)
Parse::AccessLogEntry (perl)