#!/usr/bin/perl -w


#############################################################################
## This machine-generated file was created Tue Jul  9 14:02:28 2002.
## It was built from the following files:
##   triangulate.p		(dated Sun Jul  7 17:49:09 2002)
## 
## Changes to this file will be lost when it is rebuilt
#############################################################################

#
# $Id: //pentools/main/rpat/triangulate.p#1 $
#
# written by :	Stephen J. Friedl
#               Software Consultant
#               Tustin, California USA
#               steve@unixwiz.net / www.unixwiz.net
#
#		=== This code is in the public domain ===
#
#	This program processes the logfiles produced by the RPAT daemon
#	and attempts to "triangulate" the data to locate the abusers of
#	the anonymous proxies.

use strict;

# ------------------------------------------------------------------------
# Script-wide state.
#
#	%REMOTES	remote IP => { proxy IP => hit count }
#	%NAMECACHE	IP => result of an earlier DNS lookup
#	%FOLDPROXY	leading three octets => proxy IP chosen for that block
#
my ( %REMOTES, %NAMECACHE, %FOLDPROXY );

# option settings; the defaults below can be overridden on the command line
my ( $do_namelookup, $do_foldproxy ) = ( 0, 0 );	# --namelookup, --foldproxy
my ( $minproxy,      $minhits      ) = ( 5, 5 );	# --minproxy=N, --minhits=N

$0 =~ s|..*/||;		# keep only the basename of the program for messages

# ------------------------------------------------------------------------
# Walk the command line. Each recognized option updates its setting and
# moves on to the next word; --help prints the usage text and exits, and
# anything left unrecognized is fatal.
#
for ( @ARGV )
{
	if ( m/^--help/i )                      # --help
	{
		print STDERR <<EOF;

usage: $0 [options]

  --help                show this help listing
  --namelookup          perform IP->name DNS lookups (slow!)
  --foldproxy           fold multiple proxies in a C block to one
  --minhits=N           ignore proxies that have less than N hits per remote
  --minproxy=N          ignore remotes with less than N different proxies

  SNMP records are read from standard input
EOF
		exit 1;
	}

	# simple on/off switches
	if ( m/^--namelookup$/ )      { $do_namelookup = 1;  next; }
	if ( m/^--foldproxy$/ )       { $do_foldproxy  = 1;  next; }

	# numeric thresholds
	if ( m/^--minproxy=(\d+)$/ )  { $minproxy = $1;      next; }
	if ( m/^--minhits=(\d+)$/ )   { $minhits  = $1;      next; }

	# nothing matched: word starting with a dash vs. a bare word
	die "ERROR: {$_} is invalid cmdline parameter (try --help)\n"
		if m/^-/;

	die "ERROR: {$_} is unknown cmdline parameter (try --help)\n";
}

my $icount = 0;	# count of input SNMP records
my $rcount = 0;	# count of processed remote records

while ( <STDIN> )
{
	next unless m/^SNMP/;

	s/\s+$//;

	++ $icount;

	#----------------------------------------------------------------
	# DECODE INPUT
	#
	# split up the input stream on tabs and colons. The "SNMP" word
	# is there literally just to give us a way to grep for the "good"
	# lines from the file (avoiding error & status msgs), and the date
	# is not currently interesting to us. Maybe later.
	#
	# SNMP 20020501152902  210.227.151.130:137 0.0.0.0:43014 listen
	# -------------------  --------------- --- ------- ----- ------
	# 1                    2               3   4       5     6
	#
	# The first field (tag + date) is discarded.
	#

	my(undef, $proxyip, $proxyport, $remoteip, $remoteport, $state)
		= split( m/[:\t]/ );

	# ----------------------------------------------------------------
	# A truncated record leaves the trailing fields undefined. Without
	# this check such a line would raise "uninitialized value" warnings
	# and could be tallied under an empty remote address, so skip it.
	#
	next unless defined $remoteport;

	# ----------------------------------------------------------------
	# "Listening" ports are not interesting - they don't represent any
	# possibly valid connections. Once "listen" has been dispensed with,
	# the status is no longer ever considered. (The state may be absent
	# if the line ended right after the remote port.)
	#
	next if defined $state  and  $state eq 'listen';

	# ----------------------------------------------------------------
	# Ignore all records with uninteresting IP addresses
	#
	next if is_private($proxyip)
	     || is_private($remoteip);

	next if     is_proxy($remoteport);	# ignore outbound proxy
	next unless is_proxy($proxyport);	# keep inbound proxy

	# ----------------------------------------------------------------
	# Convert the proxy and remote IP addresses to class C form. This
	# allows us to ignore any requests where the proxy and the remote
	# are in the same address block (e.g., *local* proxy users).
	#

	( my $Cproxyip  = $proxyip  ) =~ s/\.\d+$/.0/;
	( my $Cremoteip = $remoteip ) =~ s/\.\d+$/.0/;

	next if $Cremoteip eq $Cproxyip;

	# optionally treat every proxy in one C block as a single proxy
	$proxyip = foldproxy($proxyip)   if $do_foldproxy;

	# tally it!
	$REMOTES{ $remoteip } -> { $proxyip }++;

	++ $rcount;
}

printf STDERR "Read %d SNMP records, kept %d\n", $icount, $rcount;

# ------------------------------------------------------------------------
# Run through the list of all remotes and dump all proxies that have
# less than the minimum number of hits. These are really just incidental
# and probably not abusive.
#
# While we're at it, we also dump any remote that is left with fewer
# than the minimum number of different proxies.
#

# Prune pass: only needed when the hit threshold can actually exclude
# something (every tallied proxy already has at least one hit).
if ( $minhits > 1 )
{
	print STDERR "Dropping proxies with less than $minhits hits\n";

	my $dropped = 0;	# proxy entries removed for having too few hits

	# keys() returns a copy of the key list, so deleting entries from
	# either hash while walking it is safe.
	foreach my $remote ( keys %REMOTES )
	{
		my $proxylist = $REMOTES{$remote};

		foreach my $ip ( keys %{ $proxylist } )
		{
			next unless $proxylist->{$ip} < $minhits;

			delete $proxylist->{$ip};

			++ $dropped;
		}

		# does the remote itself have too few proxies?

		delete $REMOTES{$remote}
			if scalar keys %{ $proxylist } < $minproxy;
	}

	# the tally was being kept but never shown
	print STDERR "Dropped $dropped proxy entries\n";
}


# ------------------------------------------------------------------------
# Final report on standard output: one paragraph per remote, busiest
# (most distinct proxies) first, listing each proxy it hit with the hit
# count. Remotes below the --minproxy threshold are left out.
#
print "\n", "REMOTES HITTING MULTIPLE PROXIES\n";

my @ranked_remotes = sort by_proxy_count keys %REMOTES;

for my $remote ( @ranked_remotes )
{
	my $hits_for = $REMOTES{$remote};

	# too few distinct proxies to be interesting
	next if scalar keys %{ $hits_for } < $minproxy;

	my $remote_name = namelookup($remote, "    ");

	printf "    REMOTE %s%s\n", $remote, $remote_name;

	for my $proxy ( sort by_ip keys %$hits_for )
	{
		my $count      = $hits_for->{$proxy};
		my $proxy_name = namelookup($proxy, "  ");

		printf "        PROXY %-15s %4d%s\n", $proxy, $count, $proxy_name;
	}

	print "\n";
}

#
# is_proxy
#
#	Is the given port number one of the ports this program treats as
#	a proxy service (1080, 8080 or 8000)? The comparison is numeric.
#	Returns 1 if so, 0 otherwise.
#

sub is_proxy
{
	my $port = shift;

	foreach my $proxyport ( 1080, 8080, 8000 )
	{
		return 1 if $port == $proxyport;
	}

	return 0;
}

#
# is_private
#
#	Is the given IP address in the "private" range that we don't care
#	to process? We include all the RFC 1918 addresses plus a few others
#	that we've found don't contribute anything to triangulation.
#
#	Only the leading octet(s) of the dotted-quad string are examined.
#	Returns 1 for an address in one of the excluded ranges, 0 for
#	anything else (including an undefined value).
#

sub is_private
{
	my $ip = shift;

	return 0 unless defined $ip;

	# Every literal dot is escaped: a bare "." would match any character
	# and let strings that are not really in these ranges slip through.
	return 1 if  $ip =~ m/^(?: 10		# RFC 1918	10.0.0.0
	                        | 172\.1[6-9]	# RFC 1918	172.16 -> 172.19
	                        | 172\.2\d	# RFC 1918	172.20 -> 172.29
	                        | 172\.3[01]	# RFC 1918	172.30 -> 172.31
	                        | 192\.168	# RFC 1918	192.168.0.0
	                        | 127		# localhost
	                        | 169\.254	# link local
	                        | 22[4-9]	# multicast	224 -> 229
	                        | 2[345]\d	# 230 -> 239 multicast, 240 -> 255 reserved
	                        ) \./x;

	return 0;
}

#
# namelookup
#
#	Given an IP address, attempt to look up the DNS name associated
#	with it. This is a *hack* using the "dig" command.
#
#	If a second parameter is given, it's treated as a prefix that is
#	prepended to the looked-up name. The prefix is normally spaces,
#	so the caller can do:
#
#		printf " looked up %s%s\n",
#			$ip,
#			namelookup($ip, "   ");
#
#	and only add spaces at the end of the line if there is an
#	actual name.
#
#	Returns the empty string when lookups are disabled, when the
#	argument is not a dotted-quad address, or when no name was found.
#
#	The cache holds the bare name; the prefix is applied on the way
#	out, so the same address can be asked for with different prefixes
#	and each caller gets its own indentation.
#

sub namelookup
{

	my $ip  = shift;
	my $pfx = shift || "";

	return ""	if not $do_namelookup;

	# The address originates in the logfile and is about to be handed
	# to the shell, so accept nothing but digits and dots.
	return ""	unless defined $ip
			   and $ip =~ m/\A\d{1,3}(?:\.\d{1,3}){3}\z/;

	if ( not defined $NAMECACHE{$ip} )
	{
		my $lookup = `dig +short -x  $ip`;

		$lookup = ""	unless defined $lookup;	# command could not be run

		$lookup =~ s/\s.*$//s;		# keep only the first word of output
		$lookup =~ s/\.$//;		# dump trailing dot of the FQDN

		# an empty string is cached too, so failures are not retried
		$NAMECACHE{$ip} = $lookup;
	}

	my $name = $NAMECACHE{$ip};

	return ""	if $name eq "";

	return $pfx . $name;
}


#
# foldproxy
#
#	Map a proxy server's IP address onto one representative address
#	for its C block (the address with the final octet removed). The
#	first address seen in a block becomes the representative; later
#	addresses from the same block are reported as that first one, so
#	several proxy servers in a block are counted as a single proxy.
#

sub foldproxy
{
	my $ip = shift;

	my $cblock = $ip;

	$cblock =~ s/\.\d+$//;		# dump final octet

	# first address seen in this block wins; everyone after gets it back
	return $FOLDPROXY{ $cblock } ||= $ip;
}

#
# by_proxy_count
#
#	Sort comparator for keys of the %REMOTES hash. Each key is ranked
#	by the number of different proxies recorded for it, and the order
#	is *descending*: the remote with the most proxies comes first.
#

sub by_proxy_count {

	my $nproxies_b = keys %{ $REMOTES{$b} };
	my $nproxies_a = keys %{ $REMOTES{$a} };

	return $nproxies_b <=> $nproxies_a;
}

#
# by_ip
#
#	Sort comparator: compare the two IP addresses numerically by each
#	octet. The strings are split on dots and colons, so a trailing
#	":port" part takes part in the comparison as one more number.
#
#	When one address runs out of parts while everything so far was
#	equal, the shorter one sorts first. This keeps the comparison
#	consistent in both directions and never reads past the end of
#	the shorter list.
#

sub by_ip {

	return 0	if $a eq $b;

	my @A = split(/[.:]/, $a);
	my @B = split(/[.:]/, $b);

	while ( @A  and  @B )
	{
		my $rc = ( shift(@A) <=> shift(@B) );

		return $rc   if ($rc != 0);
	}

	# common leading parts were all equal: whoever has parts left is larger
	return scalar(@A) <=> scalar(@B);
}

