This program lists the IP addresses that appear in a set of log files but are not in a cache file. Every line of the log files and of the cache file starts with the IP address followed by a space. The Perl version uses a lot of RAM (over 1.7GB with a 200MB cache file), so I wrote the OCaml version to see whether OCaml's hash table implementation does better: it uses about 315MB with the same input.

perl version

# Print every IP address seen in the log files (the remaining command-line
# arguments, or STDIN when there are none) that is not in the cache file
# named by the first argument. The IP is the first whitespace-separated
# field of each line.
use strict;
use warnings;

my (%h, %out);

my $cache = shift;
die "usage: $0 cachefile [logfile ...]\n" unless defined $cache;

# Load the cache. Only the keys matter, so the values are left undef.
open my $fh, '<', $cache or die "$cache: $!\n";
while (<$fh>) {
    my ($ip) = split;
    next unless defined $ip;    # blank line: nothing to record
    $h{$ip} = undef;
}
close $fh;

# Collect the log IPs that are missing from the cache.
while (<>) {
    my ($ip) = split;
    next unless defined $ip;
    # The cached values are undef, so the test must be exists(), not
    # defined(): defined() would be false for every cached key.
    $out{$ip}++ unless exists $h{$ip};
}

print "$_\n" for keys %out;

ocaml version

download uniqip.ml
build with: ocamlopt -pp "camlp4r" -o uniqip uniqip.ml
or run with: ocaml -I camlp4 camlp4r.cma uniqip.ml
open Hashtbl;
open String;
open Unix;

(* Cache table: filled by [populate] in [main], which stores the text before
   the first space of a cache line as the key and the rest of the line as the
   value. 2000000 is the initial size passed to Hashtbl.create. *)
value ips = Hashtbl.create 2000000;
(* Keys that [lookup] in [main] did not find in [ips]; the stored value is
   always 0, so the table is used as a set. *)
value notfound = Hashtbl.create 30000;

(* Split [str] at its first space.
   Returns (text before the space, text after the space); when [str]
   contains no space the result is (str, ""). *)
value split str =
  try
    let spidx = String.index str ' ' in
    let a = String.sub str 0 spidx in
    let b = String.sub str (spidx + 1) (String.length str - (spidx + 1)) in
    (a, b)
  with [ Not_found -> (str, "") ];

(* Read the file named [fn] line by line and call [func head rest] for each
   line, where (head, rest) is the result of [split] on that line.
   The channel is closed at end of file; it is also closed before
   re-raising if [func] or the read raises any other exception. *)
value process func fn =
  let inc = open_in fn in
  try
    while True do {
      let (head, rest) = split (input_line inc) in
      func head rest
    }
  with
  [ End_of_file -> close_in inc
  | e -> do { close_in inc; raise e } ];

(* Load the cache file (Sys.argv.(1)) into [ips], scan every log file
   (Sys.argv.(2) onwards) and print, one per line, each key that was not
   in the cache. Expects at least one log file argument: with fewer than
   two arguments Array.sub raises Invalid_argument. *)
value main () =
  (* Record a cache entry; lines with nothing after the first space are
     ignored. *)
  let populate a b =
    if String.length b > 0 then
      Hashtbl.replace ips a b
    else
      () in

  (* Remember [a] in [notfound] unless the cache already knows it. *)
  let lookup a _ =
    if Hashtbl.mem ips a then
      ()
    else
      Hashtbl.replace notfound a 0 in

  let logfiles =
    Array.to_list (Array.sub Sys.argv 2 (Array.length Sys.argv - 2)) in

  let dumpkey k _ = print_endline k in

  do {
    (* load the cache *)
    process populate Sys.argv.(1);

    (* process logfiles *)
    List.iter (process lookup) logfiles;

    (* print the ips that weren't found *)
    Hashtbl.iter dumpkey notfound
  };

(* Entry point: require a cache file and at least one log file, otherwise
   print the usage message to stderr. *)
if Array.length Sys.argv < 3 then
  Printf.eprintf "usage : uniqips cachefile logfile1 .. logfileN\n"
else
  main ();

python version


# Python Version 2.2.2
# Reads the cache file named by the first command-line argument
# Processes stdin for new IP
# Sends IPs not found to stdout

import sys

# IPs already present in the cache file; only the keys are used.
dnshash = { }
# IPs already reported, so each missing IP is printed once.
notfound = { }

# Load the cache: the IP is the first whitespace-separated field of a line.
f = open(sys.argv[1])
try:
    for line in f:
        fields = line.split()
        if fields:  # skip blank lines instead of raising IndexError
            dnshash[fields[0]] = None
finally:
    f.close()

# Report each IP from stdin that is neither cached nor already reported.
for line in sys.stdin:
    fields = line.split()
    if not fields:
        continue
    ip = fields[0]
    if ip not in dnshash and ip not in notfound:
        notfound[ip] = None
        # sys.stdout.write works on both Python 2 and Python 3,
        # unlike the print statement.
        sys.stdout.write(ip + "\n")