#!/usr/bin/perl

# arp monitor cronjob:
# - periodically check
#   - ping -b -c 2 {eth0 broadcast address}
#   - ping -b -c 1 {eth0 gateway}
# - test
#   - arp -n -i {eth0}
# - if (HWaddress) of {eth0 gateway} is (incomplete) then reboot

use strict;


# Maximum time (seconds) that errors may persist continuously before the
# machine is rebooted.
my $theMaxBadlinkTime = 600;

# directory that holds the per-interface log and first-error marker files
my $theLogBase = "/cronjobs/arp_monitor";

# program path locations
my $ARP = "/usr/sbin/arp";
my $CAT = "/bin/cat";
my $IFCONFIG = "/sbin/ifconfig";
my $PING = "/bin/ping";
my $ROUTE = "/sbin/route";

my $REBOOT = "/sbin/reboot";

# location of /proc/uptime
my $PROC_UPTIME = "/proc/uptime";

# the interface we want is passed on the command line
die "need to supply interface name (e.g. eth0) on command line\n" if ($#ARGV < 0);
my $theIF = $ARGV[0];

# The interface name is interpolated into shell command lines and into log
# file paths further down, so refuse anything that is not a plain interface
# name (letters, digits, and the separators used by aliases/VLANs/bridges).
die "invalid interface name '$theIF'\n"
  unless ($theIF =~ /\A[A-Za-z0-9][A-Za-z0-9_.:\-]*\z/);

# this flag tells us if we should execute the reboot handler
my $theRebootFlag = 0;
# first reason that caused the reboot flag to be set (empty if none yet)
my $theRebootReason = "";

# now that we have the interface, figure out what the broadcast address is using 'ifconfig'
# Two output formats are recognised:
#   legacy net-tools:  "... Bcast:192.168.1.255 ..."
#   newer net-tools:   "... broadcast 192.168.1.255"
# If several lines match, the last one wins.

my @ifconfig_out = `$IFCONFIG $theIF`;
my $theBroadcast = "unknown";
foreach my $theLine (@ifconfig_out) {
  #print "line: $theLine";
  if ($theLine =~ /(?:Bcast:|\bbroadcast\s+)(\S+)/) {
    $theBroadcast = $1;
    #print "broadcast address is $theBroadcast\n";
  }
}
# without a broadcast address the later ping test cannot be run at all
die "could not get broadcast address\n" if ($theBroadcast eq "unknown");

# now get the gateway router using 'route'
# We look for the default route (destination 0.0.0.0, mask 0.0.0.0, flags UG)
# whose last column is our interface; the second column is the gateway.

my @route_out = `$ROUTE -n`;
my $theGateway = "unknown";
foreach my $theLine (@route_out) {
  #print "line: $theLine";
  # \Q...\E keeps characters such as '.' in the interface name (e.g. a VLAN
  # interface "eth0.100") from being treated as regex metacharacters.
  if ($theLine =~ /^0\.0\.0\.0\s+(\S+)\s+0\.0\.0\.0\s+UG\s+\d+\s+\d+\s+\d+\s+\Q$theIF\E$/) {
    $theGateway = $1;
    #print "gateway is $theGateway\n";
  }
}
if ($theGateway eq "unknown") {
  # no gateway, so we should reboot
  #print "could not get gateway\n";
  # only record the reason if no earlier check has already set one
  $theRebootReason = "could not get gateway" if ($theRebootReason eq "");
  $theRebootFlag = 1;
}

# Read the system uptime: the leading run of digits in /proc/uptime
# (i.e. whole seconds; anything after the digits is ignored).

my $proc_out = `$CAT $PROC_UPTIME`;
my ($theUptime) = ($proc_out =~ /^(\d+)/);
$theUptime = "unknown" unless (defined $theUptime);
#print "uptime is $theUptime\n";
die "could not get uptime\n" if ($theUptime eq "unknown");

# now run ping commands to fill up arp cache
# A ping summary line containing "0 received" means nothing answered; that
# sets the reboot flag.  The \b keeps counts such as "10 received" from
# being mistaken for zero.

# 1) broadcast ping on the interface (stderr discarded)
my @pingb_out = `$PING -b -c 2 -I $theIF $theBroadcast 2> /dev/null`;
foreach my $theLine (@pingb_out) {
  #print "line: $theLine";
  if ($theLine =~ /\b0 received/) {
    # this is really bad because nothing on the local network
    # answered the broadcast ping
    #print "reboot request from ping broadcast\n";
    $theRebootReason = "reboot request from ping broadcast" if ($theRebootReason eq "");
    $theRebootFlag = 1;
  }
}

# 2) ping the default gateway.  If no gateway was found, the reboot flag is
# already set, and pinging the placeholder "unknown" would only trigger a
# pointless host name lookup, so skip the ping in that case.
my @pingg_out = ();
if ($theGateway ne "unknown") {
  @pingg_out = `$PING -c 1 -I $theIF $theGateway`;
}
foreach my $theLine (@pingg_out) {
  #print "line: $theLine";
  if ($theLine =~ /\b0 received/) {
    # this is really bad because we don't have a ping reply
    # from the default gateway
    #print "reboot request from ping default gateway\n";
    $theRebootReason = "reboot request from ping default gateway" if ($theRebootReason eq "");
    $theRebootFlag = 1;
  }
}

# now look at the arp cache
# Find the gateway's entry in 'arp -n' output and take its third column as
# the hardware address.  Entries marked "incomplete" are skipped because they
# have no hardware address (the third column would be something else).
my @arp_out = `$ARP -n -i $theIF`;
my $theGatewayAddress = "unknown";
foreach my $theLine (@arp_out) {
  #print "line: $theLine";
  # \Q...\E: the dots in the gateway address must match literally
  next unless ($theLine =~ /^\Q$theGateway\E\s+/);
  #print "gateway arp entry: $theLine";
  next if ($theLine =~ /incomplete/);
  if ($theLine =~ /^\Q$theGateway\E\s+\S+\s+(\S+)/) {
    $theGatewayAddress = $1;
    #print "gateway mac address is $theGatewayAddress\n";
  }
}
# if we don't have a hardware address for the gateway, something is wrong
# and we should reboot
if (($theGatewayAddress eq "unknown") || ($theGatewayAddress eq "(incomplete)")) {
  #print "reboot request from arp $theGatewayAddress\n";
  # only record the reason if no earlier check has already set one
  $theRebootReason = "reboot request from arp $theGatewayAddress" if ($theRebootReason eq "");
  $theRebootFlag = 1;
}

# if the reboot flag is set, we should log the time that it happened
# if the number of continuous log entries spans greater than a limit, then reboot

# log file locations
#   .log   - one line appended for every failing run
#   .first - holds the time of the first failure in the current run of
#            failures; removed again as soon as a run succeeds
my $theLogError = "$theLogBase/$theIF.log";
my $theLogFirstError = "$theLogBase/$theIF.first";

my $theCurrentTime = time();

if ($theRebootFlag > 0) {
  # write to continuous error log
  open (my $theLogErrorFH, ">>", $theLogError) or die ("Could not open file. $!");
  print $theLogErrorFH "$theCurrentTime, $theUptime, $theRebootReason\n";
  close ($theLogErrorFH) or die ("Could not close file. $!");

  # Try to read the time of the first error.  It stays undefined when the
  # marker file is missing, unreadable, or does not start with a number, so
  # that a damaged marker can never be mistaken for "first error at time 0"
  # (which would cause an immediate reboot).
  my $theFirstError;
  if (-e $theLogFirstError) {
    if (open (my $theFirstErrorFH, "<", $theLogFirstError)) {
      my $theFirstLine = <$theFirstErrorFH>;
      close ($theFirstErrorFH);
      if (defined($theFirstLine) && $theFirstLine =~ /^\s*(\d+)/) {
        $theFirstError = $1;
      }
    }
  }

  if (defined $theFirstError) {
    # if the time delta is exceeded, then reboot
    my $theTimeDifference = $theCurrentTime - $theFirstError;
    if ($theTimeDifference > $theMaxBadlinkTime) {
      # rebooting
      # NOTE(review): the marker file is not removed here; if it survives the
      # reboot and the checks still fail, the next run will reboot again
      # right away - confirm that this is intended.
      print "rebooting now ($REBOOT)\n";
      if (system($REBOOT) != 0) {
        warn "could not run $REBOOT (status $?)\n";
      }
    } else {
      #print "time difference is $theTimeDifference (less than $theMaxBadlinkTime)\n";
    }

  } else {
    # no usable first-error marker, so (re)create it with the current time
    open (my $theFirstErrorFH, ">", $theLogFirstError) or die ("Could not open file. $!");
    print $theFirstErrorFH "$theCurrentTime\n";
    close ($theFirstErrorFH) or die ("Could not close file. $!");
  }

} else {
  # reboot flag is 0, so we can erase the first error file
  unlink($theLogFirstError) if (-e $theLogFirstError);
}

# monitor.pl script ends

