TOC

Common Log Format (通用日志格式)

  1. 由 NCSA HTTPd (Apache HTTP Server 的前身) 定义的一种标准 Web 服务器日志格式。
  2. 格式:host ident authuser date request status bytes
    例如:127.0.0.1 user-identifier frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326
  3. 如果哪一个字段没有值,就用 - 代替。
import re
from datetime import datetime

# One capture group per CLF field, in order:
# host, ident, authuser, [date], "request", status, bytes.
RE_CLF = re.compile(r'(\S+) (\S+) (\S+) \[(.*?)\] "(.*?)" (\d{3}) (\d+|-)')


def parse_clf(log_line):
    """Parse one Common Log Format line into a dict of fields.

    Args:
        log_line: A log line of the form
            ``host ident authuser [date] "request" status bytes``.
            Matching starts at the beginning of the string; text after
            the bytes field is not inspected.

    Returns:
        A dict with keys ``host``, ``ident``, ``authuser``, ``date``,
        ``request``, ``status`` and ``bytes``. ``date`` is a
        timezone-aware ``datetime``, ``status`` is an ``int``, and
        ``bytes`` is an ``int`` or ``None`` when the field is ``-``.
        The remaining values are the raw matched strings.

    Raises:
        ValueError: If the line does not match the CLF pattern, or if
            the bracketed date is not in ``%d/%b/%Y:%H:%M:%S %z`` form.
    """
    found = RE_CLF.match(log_line)
    if found is None:
        raise ValueError('Log line does not match CLF format')

    (host, ident, authuser, raw_date,
     request, raw_status, raw_bytes) = found.groups()

    status = int(raw_status)
    when = datetime.strptime(raw_date, '%d/%b/%Y:%H:%M:%S %z')
    # A lone '-' marks a missing byte count.
    byte_count = None if raw_bytes == '-' else int(raw_bytes)

    return dict(
        host=host,
        ident=ident,
        authuser=authuser,
        date=when,
        request=request,
        status=status,
        bytes=byte_count,
    )

# Example usage: parse a sample CLF line and print the resulting dict.
# The request field must be wrapped in double quotes: RE_CLF matches a
# literal '"' on either side of it, and single quotes here would also
# terminate the surrounding string literal early.
log_example = '127.0.0.1 user-identifier frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326'
parsed_log = parse_clf(log_example)
print(parsed_log)
# Expected output:
# {'host': '127.0.0.1', 'ident': 'user-identifier', 'authuser': 'frank',
#  'date': datetime.datetime(2000, 10, 10, 13, 55, 36, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=61200))),
#  'request': 'GET /apache_pb.gif HTTP/1.0', 'status': 200, 'bytes': 2326}
如果你有魔法,你可以看到一个评论框~