500 MB access log file from apache in common log format:
www.example.com 1.2.3.4 - - [16/Feb/2013:16:17:04 -0500] "GET /movies/movie-posters/t1234.jpg HTTP/1.1" 200 29670 "http://www.example.com/movies/movie-posters/" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2"
I want to know how much traffic each web site (virtual host) made. Let’s start with the results:
Lang | Method | CPU sec | CPU usr | CPU sys | % C | % perl |
C (g++ -O2) | pointer | 1.28 | 1.04 | 0.23 | 100% | 28% |
C (clang++ -O2) | pointer | 1.29 | 1.05 | 0.23 | 101% | 28% |
C++ (clang++ -O2) | std::string.find() | 1.52 | 1.31 | 0.19 | 119% | 34% |
C++ (g++ -O2) | std::string.find() | 1.52 | 1.34 | 0.17 | 119% | 34% |
C (g++ -O0) | pointer | 1.97 | 1.69 | 0.26 | 154% | 43% |
C++ (g++ -O2) | boost regex | 2.98 | 2.78 | 0.18 | 238% | 67% |
pypy | pointer | 3.10 | 2.89 | 0.19 | 242% | 68% |
pypy | perl regex | 3.30 | 3.07 | 0.22 | 257% | 73% |
pypy | str.split() | 3.50 | 3.27 | 0.23 | 273% | 77% |
perl | perl regex | 4.53 | 4.34 | 0.17 | 354% | 100% |
python | str.split() | 4.53 | 4.34 | 0.17 | 354% | 100% |
python | perl regex | 7.37 | 7.16 | 0.18 | 576% | 163% |
python | pointer | 32.96 | 32.71 | 0.16 | 2575% | 728% |
All code is listed at the bottom of the article.
Versions: gcc 4.7.2, clang 3.0, perl 5.14.2, python 2.7.3, pypy 1.9.0, boost regex 1.49; everything stock from Ubuntu 12.10.
When I say “pointer” in the case of C it’s really advancing the pointer through the string and comparing character by character. AFAIK, you can’t go any faster than that, could you? I cheated and used std::map in the C case, because I was lazy and didn’t want to implement hash map myself, or include external libraries. I went the extra mile and used the map with char* instead of string though, so I guess it’s as fast as possible.
Although I use the same term “pointer” with python, it’s obviously not the same thing, because it’s not simple character array as in the C case, but a python str object, which is much higher level etc., so it is expected to be very slow (25x slower than O2 optimized C). The very curious thing is, that when you run the python “pointer” variant through pypy, it is only 2.4 times slower than optimized C, which is impressive! Obviously, it has some serious drawbacks too: you have to run it specifically through pypy, or you are going to be penalized heavily if you run it through standard python, and it’s also almost as verbose as the C variant — but still, you get the benefits of not needing to compile + not having to deal with pointers and all the low level stuff.
One pretty decent compromise in terms of flexibility and speed seems to be C++ with boost regex. Code is quite short and convenient, while only 2.4x slower than optimized C. On the other hand pypy with perl regex is really close behind, so I may still prefer the no-need-to-compile-and-less-arcane syntax solution.
Some day I may try extracting some more complex stats, like grouping by hits, traffic, referrers, requests etc. with pypy vs. perl vs. C++. I guess differences are going to be even larger.
Here’s the code:
g++ -Wall -O0 access.c -o access-g++-O0
#include <stdio.h> #include <stdlib.h> #include <string> #include <map> #include <iostream> #include <cstring> using namespace std; struct cmp_str : public std::binary_function<const char*, const char*, bool> { public: bool operator() (const char* str1, const char* str2) const { return std::strcmp(str1, str2) < 0; } }; int main() { #define BUFSZ 8192 char line[BUFSZ]; char vhost[BUFSZ]; char size[BUFSZ]; int i, p; map <const char*, int, cmp_str> vhosts; while(fgets(line, BUFSZ, stdin)) { for (i=0; i<BUFSZ && line[i] != '\0' && line[i] != ' '; i++) { vhost[i] = line[i]; } vhost[i] = '\0'; // fast fwd two '"' for (/* keep from prev. for */; i<BUFSZ && line[i] != '\0' && line[i] != '"'; i++); for (i++; i<BUFSZ && line[i] != '\0' && line[i] != '"'; i++); // fast fwd two ' ' for (/* keep from prev. for */; i<BUFSZ && line[i] != '\0' && line[i] != ' '; i++); for (i++; i<BUFSZ && line[i] != '\0' && line[i] != ' '; i++); p = 0; for (i++; i<BUFSZ && line[i] != '\0' && line[i] != ' '; i++) { size[p] = line[i]; p++; } size[p] = '\0'; vhosts[strdup(vhost)] += strtoul(size, NULL, 10); } cout << "vhosts:" << vhosts.size() << endl; return 0; }
–
g++ -Wall -O2 access.cpp -o access-cpp-g++-O2
#include <string> #include <map> #include <iostream> #include <cstdlib> using namespace std; int main() { string line; string vhost; string size; map <string, size_t> vhosts; size_t b, e; ios::sync_with_stdio(false); while (getline(cin, line)) { e = line.find(' '); vhost = line.substr(0, e); b = line.find('"', e+1); b = line.find(' ', b+1); b = line.find(' ', b+1); e = line.find(' ', b+1); size = line.substr(b, e); vhosts[vhost] += strtoul(size.c_str(), NULL, 10); } cout << "vhosts:" << vhosts.size() << endl; return 0; }
–
g++ -Wall -O2 access-boostre.cpp -o access-boostre-g++-O2 -lboost_regex
#include <string> #include <map> #include <iostream> #include <boost/regex.hpp> #include <cstdlib> int main() { std::string line; std::string vhost; std::string size; boost::regex re ("^\\([^[:space:]]+\\) [^[:space:]]+ [^[:space:]]+ [^[:space:]]+ .*? \".*?\" [[:digit:]]+ \\([^[:space:]]+\\)", boost::regex::emacs); boost::cmatch m; std::string::const_iterator start, end; std::map <std::string, size_t> vhosts; std::ios::sync_with_stdio(false); while (getline(std::cin, line)) { start = line.begin(); end = line.end(); if (boost::regex_search(line.c_str(), m, re)) { vhost = m[1]; size = m[2]; } else { std::cout << "Not found." << std::endl; } vhosts[vhost] += strtoul(size.c_str(), NULL, 10); } std::cout << "vhosts:" << vhosts.size() << std::endl; return 0; }
–
#!/usr/bin/perl use warnings; use strict; my %stats; while (<STDIN>) { /^(\S+) \S+ \S+ \S+ \[.*?\] ".*?" \d+ (\S+)/; $stats{$1} += $2; } printf "vhosts:%d\n", scalar keys %stats;
–
#!/usr/bin/pypy import sys import re def f1(): stats = {} prog = re.compile(r'^(\S+) \S+ \S+ \S+ \[.*?\] ".*?" \d+ (\S+)'); for line in sys.stdin: m = prog.match(line) (vhost, size) = m.groups() if vhost not in stats: stats[vhost] = 0 else: stats[vhost] += int(size) return len(stats.keys()) def f2(): stats = {} for line in sys.stdin: vhost = line.split(" ", 1)[0] split = line.split('"', 2) size = int(split[2].split(" ", 3)[2]) if vhost not in stats: stats[vhost] = 0 else: stats[vhost] += size return len(stats.keys()) def f3(): stats = {} for line in sys.stdin: llen = len(line)-1 for i in range(0, llen): if (line[i] == ' '): vhost = line[0:i] break for i in range(i+1, llen): if (line[i] == '"'): break for i in range(i+1, llen): if (line[i] == '"'): break for i in range(i+1, llen): if (line[i] == ' '): break for i in range(i+1, llen): if (line[i] == ' '): break s = i for i in range(i+1, llen): if (line[i] == ' '): size = int(line[s:i]) break if vhost not in stats: stats[vhost] = 0 else: stats[vhost] += size return len(stats.keys()) if len(sys.argv) < 2: print "Usage: <fN>\n" exit(1) stats = locals()[sys.argv[1]]() print "vhosts:%d" % (stats)
–