This is a program that lists the IP addresses that appear in a set of log files but not in a cache file. Every line of the log files and of the cache file starts with an IP address followed by a space. The Perl version uses a ton of RAM (over 1.7GB with a 200MB cache file), so I wrote the OCaml version to see whether the OCaml hash table implementation was better. It uses about 315MB with the same input.
#!/usr/bin/perl
# Print every IP address that occurs in the log files but not in the cache
# file.
#
# Usage: <script> cachefile [logfile ...]
#
# The first argument is the cache file; the remaining arguments (or stdin when
# there are none) are the log files. The first whitespace-delimited field of
# each line is taken to be the IP address. Each missing IP is printed once, in
# hash order.
use strict;
use warnings;

my $cache_file = shift;
die "usage: $0 cachefile [logfile ...]\n" unless defined $cache_file;

# Only the keys matter; storing undef keeps the per-entry cost low.
my %cached;
open my $cache_fh, '<', $cache_file or die "$cache_file: $!\n";
while (my $line = <$cache_fh>) {
    my ($ip) = split ' ', $line;
    next unless defined $ip;    # blank or whitespace-only line
    $cached{$ip} = undef;
}
close $cache_fh;

my %missing;
while (my $line = <>) {
    my ($ip) = split ' ', $line;
    next unless defined $ip;    # blank or whitespace-only line
    # Must be "exists", not "defined": the cached values are undef, so
    # "defined" would report every cached IP as missing.
    $missing{$ip} = undef unless exists $cached{$ip};
}

print "$_\n" for keys %missing;
(*
build with: ocamlopt -pp "camlp4r" -o uniqip uniqip.ml
or run with: ocaml -I camlp4 camlp4r.cma uniqip.ml
*)
open Hashtbl;
open String;
open Unix;
(* Cache contents: maps an IP address to the remainder of its cache-file line
   (filled in by main). 2000000 is only the initial size given to
   Hashtbl.create. *)
value ips = Hashtbl.create 2000000;
(* IP addresses found in the log files but absent from ips. Only the keys are
   used; main stores a dummy 0 as the value. *)
value notfound = Hashtbl.create 30000;
(* Cut str at its first space. Returns (text before the space, text after
   it); when str contains no space the result is (str, ""). *)
value split str =
  let space_at =
    try Some (String.index str ' ') with [ Not_found -> None ] in
  match space_at with
  [ Some i ->
      let rest_len = String.length str - i - 1 in
      (String.sub str 0 i, String.sub str (i + 1) rest_len)
  | None -> (str, "") ];
(* Read the file named fn line by line, cut every line at its first space
   with split, and call func with the two parts (first field, remainder).
   The channel is closed when the end of the file is reached. If func or the
   read raises anything else, the channel is closed as well before the
   exception is passed on, so a failure does not leak the file descriptor. *)
value process func fn =
  let inc = open_in fn in
  try
    while True do {
      let (key, rest) = split (input_line inc) in
      func key rest;
    }
  with
  [ End_of_file -> close_in inc
  | exn -> do { close_in_noerr inc; raise exn } ];
(* Load the cache file named by Sys.argv.(1) into ips, scan every remaining
   command-line argument as a log file, and print each IP address that was
   seen in a log file but is not in the cache. *)
value main () =
  (* Cache lines whose remainder after the first space is empty are skipped. *)
  let remember_cached ip rest =
    if String.length rest = 0 then () else Hashtbl.replace ips ip rest in
  (* Record ip in notfound unless the cache already knows it. *)
  let note_if_uncached ip _ =
    if Hashtbl.mem ips ip then () else Hashtbl.replace notfound ip 0 in
  let print_ip ip _ = print_endline ip in
  (* Everything after the cache file name is a log file. *)
  let logfiles = Array.sub Sys.argv 2 (Array.length Sys.argv - 2) in
  do {
    process remember_cached Sys.argv.(1);
    Array.iter (process note_if_uncached) logfiles;
    Hashtbl.iter print_ip notfound;
  };
(* Entry point: run main when a cache file and at least one log file were
   given on the command line, otherwise print the usage line to stderr. *)
if Array.length Sys.argv >= 3 then
  main ()
else
  Printf.eprintf "usage : uniqips cachefile logfile1 .. logfileN\n";
#!/usr/bin/python2
# Python Version 2.2.2
# Reads the cache file named by the first command-line argument
# Processes stdin for new IPs
# Sends each IP not found in the cache to stdout, once, in order of first
# appearance
import sys


def first_field(line):
    """Return the first whitespace-delimited field of line, or None if the
    line is blank."""
    fields = line.split()
    if fields:
        return fields[0]
    return None


def main(argv, log, out):
    """Write to out every IP from log that is not in the cache file argv[1].

    argv -- command-line argument list; argv[1] is the cache file path
    log  -- iterable of log lines (the script passes sys.stdin)
    out  -- object with a write() method (the script passes sys.stdout)

    Returns the process exit status: 0 on success, 1 on a usage error.
    """
    if len(argv) < 2:
        sys.stderr.write("usage: uniqip cachefile < logfile\n")
        return 1

    # Only the keys matter; dicts are used as sets for Python 2.2 compatibility.
    dnshash = {}
    f = open(argv[1])
    try:
        for line in f:
            ip = first_field(line)
            if ip is not None:
                dnshash[ip] = None
    finally:
        f.close()

    notfound = {}
    for line in log:
        ip = first_field(line)
        # Skip blank lines, cached IPs and IPs already reported.
        if ip is None or ip in dnshash or ip in notfound:
            continue
        notfound[ip] = None
        out.write(ip + "\n")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv, sys.stdin, sys.stdout))