#!/usr/bin/env perl

# Copyright (c) 2001-2004 The Trustees of Indiana University.  
#                         All rights reserved.
# Copyright (c) 1998-2001 University of Notre Dame. 
#                         All rights reserved.
# Copyright (c) 1994-1998 The Ohio State University.  
#                         All rights reserved.
# 
# This file is part of the LAM/MPI software package.  For license
# information, see the LICENSE file in the top level directory of the
# LAM/MPI source distribution.
# 
# $HEADER$
#
# Function:	- start an MPI application
#

use strict;
use warnings;
use File::Temp qw(tempfile tempdir);
use Data::Dumper;
use POSIX;

# Global variables

my $host_map;
my $as_fh;
my $as_fn = "";
my $host_arch_hash_created = 0;
my $mpirun_arg = "";
my %mpiexec_arg;
my $verbose_mode = "";
my $boot_flag = "-H";
my $debug_mode = "";
my $tv_mode = "";
my $ssi_args = "";
my $configfile = "";
my $lamboot_on = 0;
my $prefix_boot = 0;
my $prefix_path = "";
my $prefix_bin = "";
my $prefix_arg = "";
my $machinefile = "";
my $bootargs = "";
my $arg_index = 0;
my @local_args;
my $current_state = "";
my $booted_lam = 0;
my $pid;
my $child_waitpid_status;
my $lamhalt_timeout = 10;     # If lamhalt does not complete in 10 secs we 
                              # interrupt it and use lamwipe to lamhalt


# Specify signal handler for SIGINT, SIGTERM and SIGHUP

$SIG{INT} = \&sig_handler_INT;
$SIG{TERM} = \&sig_handler_TERM;
$SIG{HUP} = \&sig_handler_HUP;

# Parse global args

parse_global_args();
if ( $debug_mode ne "" )  {
    print "mpiexec: Global argument parsing done\n";
}


# lamboot if -machinefile or -boot specified 
# -machinefile specifies the host file to be passed to lamboot
# -boot indicates that we should lamboot however if -machinefile is
# not specified then no host file is passed to lamboot

if ( $lamboot_on == 1 ) {

# Cannot use system() as it does not propogate the signals to child
# process

# Cannot use backtics cause even though it propogates signal to child
# process we get runaway lamboot not sure why..

    if ( ( $verbose_mode ne "" ) || ( $debug_mode ne "" ) ) {
	print "mpiexec: Booting lam..\n";
    }
    $current_state = "lamboot";
    $booted_lam = 1;

    # Subroutine for fork-exec-waitpid (few) to fork-exec lamboot
    
    few();

    $current_state = "";
    undef ($pid);
    if ( ( $verbose_mode ne "" ) || ( $debug_mode ne "" ) ) {
	print "mpiexec: Lamboot Complete\n";
    }
} 

# Create hostname => node_no hash 
# This will print error and exit if "lamnodes"fail's 
# So when user had not lambooted before executing mpiexec and
# has not specified -machinefile then lamnodes will fail and we will
# print error and exit

create_host_node_hash();
if ( $debug_mode ne "" )  {
    print "mpiexec: Host-Node Number hash created\n";
}

# Create temp file for appschema (UNLINK makes sure file is deleted on exit
my $tmpdir;
if (exists($ENV{TMPDIR})) {
    $tmpdir = $ENV{TMPDIR};
} else {
    $tmpdir = "/tmp";
}
($as_fh, $as_fn) = tempfile($tmpdir . "/lam_appschema_XXXXXX");
if ( $debug_mode ne "" )  {
    print "mpiexec: Temporary file $as_fn created " . 
	   "(will be used as app schema file for mpirun)\n";
}

# Parse non global command line args

if ($configfile ne "") {
    if ( ( $verbose_mode ne "" ) || ( $debug_mode ne "" ) ) {
	print "mpiexec: Parsing config file $configfile \n";
    }
    parse_file($configfile);
} else { 
    @local_args = @ARGV[$arg_index .. $#ARGV];
    parse_arg(@local_args);
}

close($as_fh) || 
    mpiexec_die("Cannot close temp appshema file $as_fn!!!! $! \n");

# Execute mpirun

if ( ( $verbose_mode ne "" ) || ( $debug_mode ne "" ) ) {
    print "mpiexec: Launching MPI programs\n";
}
$current_state = "mpirun";

# Subroutine for fork-exec-waitpid (few) to fork-exec mpirun

few();

$current_state = "";
undef ($pid);
if ( ( $verbose_mode ne "" ) || ( $debug_mode ne "" ) ) {
    print "mpiexec: MPI program execution over..\n";
}

# Delete temp app cshema file we had created

if ( $debug_mode ne "" )  {
    print "mpiexec: deleting temprory file $as_fn\n";
}
unlink($as_fn) || 
    mpiexec_die("Cannot delete temp appshema file $as_fn!!!! $! \n");
$as_fn = "";

# lamhalt if we lambooted from within mpiexec

if ( $booted_lam == 1 ) {
    if ( ( $verbose_mode ne "" ) || ( $debug_mode ne "" ) ) {
	print "mpiexec: Performing Lamhalt\n";
    }

# Our own function which takes care of Ignoring signals SIGINT,
# SIGTERM and SIGHUP during lamhalt. Also has a timeout mechanism
# where in if lamhalt does not complete within specified number of
# seconds it interrupts lamhalt and runs lamwipe to cleanup LAM RTE

    my_lamhalt();
}

exit(0);



######################################################################
## Subroutines from here down
######################################################################


# Our die function

sub mpiexec_die {
    if ( $debug_mode ne "" )  {
	print "mpiexec: mpiexec_die called\n";
    }

    # Delete temp app cshema file we had created
    
    if ( $as_fn ne "" ) {
	if ( $debug_mode ne "" )  {
	    print "mpiexec: deleting temprory file $as_fn\n";
	}
	unlink($as_fn);
	$as_fn = "";
    }

    # lamhalt if we lambooted from within mpiexec
    
    if ( $booted_lam == 1 ) {
        
# Our own function which takes care of Ignoring signals SIGINT,
# SIGTERM and SIGHUP during lamhalt. Also has a timeout mechanism
# where in if lamhalt does not complete within specified number of
# seconds it interrupts lamhalt and runs lamwipe to cleanup LAM RTE
	
	my_lamhalt();
    }
    
    die(@_);
}


# Signal Handlers

# SIGINT signal handler

sub sig_handler_INT {
    
    # Reinstall sighandler

    $SIG{INT} = \&sig_handler_INT;
    
    if ( $debug_mode ne "" )  {
	print "mpiexec: inside SIGINT handler\n";
    }

    # parent forwards the signal to fork-exec'ed lamboot process
    
    if ( ( $current_state ne "" ) && ( $pid ) ) {
	print "sending sig_INT to $current_state \n";
	kill INT => $pid;
	waitpid($pid, 0);
	
        # Handle waitpid status
	
	$child_waitpid_status = $?;
	handle_waitpid_status( $child_waitpid_status, $current_state );

    }
    
    # call generic signal handler

    sig_handler();

}


# SIGTERM signal handler

sub sig_handler_TERM {
    
    # Reinstall sighandler

    $SIG{TERM} = \&sig_handler_TERM;
    
    if ( $debug_mode ne "" )  {
	print "mpiexec: inside SIGTERM handler\n";
    }

    # parent forwards the signal to fork-exec'ed lamboot process
    
    if ( ( $current_state ne "" ) && ( $pid ) ) {
	kill TERM => $pid;
	waitpid($pid, 0);

        # Handle waitpid status
	
	$child_waitpid_status = $?;
	handle_waitpid_status( $child_waitpid_status, $current_state );

    }
    
    # call generic signal handler

    sig_handler();

}


# SIGHUP signal handler

sub sig_handler_HUP {
    
    # Reinstall sighandler

    $SIG{HUP} = \&sig_handler_HUP;

    if ( $debug_mode ne "" )  {
	print "mpiexec: inside SIGHUP handler\n";
    }
    
    # parent forwards the signal to fork-exec'ed lamboot process
    
    if ( ( $current_state ne "" ) && ( $pid ) ) {
	kill HUP => $pid;
	waitpid($pid, 0);

        # Handle waitpid status
	
	handle_waitpid_status( $?, $current_state );

    }
    
    # call generic signal handler

    sig_handler();

}


# SIGALRM signal handler

sub sig_handler_ALRM { 

    # Reinstall sighandler

    $SIG{ALRM} = \&sig_handler_ALRM;

    if ( $debug_mode ne "" )  {
	print "mpiexec: inside SIGALRM handler\n";
    }

    die "timeout\n";

}; 


# generic signal handler
 
sub sig_handler {
    
    # MRC: This stuff will change !!!!!! 
    # We might just get rid of sig_handler and have sig_handler_* 
    # just call mpiexec_die
    
    mpiexec_die("\n");

}


# Handle waitpid status

sub handle_waitpid_status {

    my $waitpid_status = $_[0];
    my $wait_func = $_[1];
    my $p_exit_status;
    
    if ( $debug_mode ne "" )  {
	print "mpiexec: Inside handle_waitpid_status " . 
	    "Function: $wait_func, Error Status: $waitpid_status\n";
    }

    if ( WIFEXITED($waitpid_status) ) {
	if ( ( $p_exit_status = WEXITSTATUS($waitpid_status) ) > 0 ) {
	    mpiexec_die "$wait_func failed with exit status $p_exit_status\n";
	}
    } 
    if ( WIFSIGNALED($waitpid_status) ) {
	mpiexec_die "$wait_func was killed by signal " . 
	    WTERMSIG($waitpid_status) . "\n";
    }
}


# Parse global args 

sub parse_global_args { 

    # Check for the bozo case of no command line arguments

    if ($#ARGV == -1) {
        show_help();
        exit(0);
    }

    # look for -test flag as first arg .. this is internal flag used by
    # lam configure script to find if appropriate version of Perl is
    # available

    if ($ARGV[0] eq "-test") {
	exit(0);
    }

    # look for global args -v, -d, -machinefile, -boot, -boot-args
    # and, -configfile
    
    while ( defined($ARGV[$arg_index]) ) {
	if ($ARGV[$arg_index] eq "-v") {
	    $verbose_mode = "-v";
            $boot_flag = "";
	} elsif ($ARGV[$arg_index] eq "-d") {
	    $debug_mode = "-d";
            $boot_flag = "";
	} elsif ($ARGV[$arg_index] eq "-configfile") {
	    $configfile = $ARGV[++$arg_index];
	} elsif ($ARGV[$arg_index] eq "-machinefile") {
	    $machinefile = $ARGV[++$arg_index];
	    $lamboot_on = 1;
	} elsif ($ARGV[$arg_index] eq "-boot") {
	    $lamboot_on = 1;
	} elsif ($ARGV[$arg_index] eq "-prefix") {
	    $prefix_boot = 1;
	    $prefix_path = $ARGV[++$arg_index];
	    $prefix_bin = "$prefix_path/bin/";
	} elsif ($ARGV[$arg_index] eq "-boot-args") {
	    $bootargs = $ARGV[++$arg_index];
	    $lamboot_on = 1;
	} elsif ($ARGV[$arg_index] eq "-tv") {
	    $tv_mode = "-tv";
	} elsif ($ARGV[$arg_index] eq "-ssi") {
            $ssi_args = "$ssi_args -ssi " . $ARGV[++$arg_index] . " " . 
                "\"" . $ARGV[++$arg_index] . "\"";
	} elsif ($ARGV[$arg_index] eq "-h") {
	    show_help();
	    exit(0);
	} else {
	    return;
	}
	$arg_index++;
    }

}


# Create hostname => node_no hash

sub create_host_node_hash {
    my $first = 1;
    my $line;
    my $node_number;
    my $hostname;

    open (NODELIST, $prefix_bin."lamnodes 2>&1 |") || 
	mpiexec_die("Cannot run Lamnodes! $!\n",
		    "Check if you had booted lam before calling mpiexec ",
		    "else pass -machinefile to mpiexec\n");
    while ($line = <NODELIST>) {
	chomp($line);
	if ( $first && ($line =~ /^-----------/) ) {
	    close(NODELIST);
	    mpiexec_die("Lamnodes Failed!\n",
			"Check if you had booted lam before calling mpiexec ",
			"else use -machinefile to pass host file to mpiexec",
			"\n");
	}
	$first = 0;
	($node_number, $hostname) = ($line =~ /^(.*?)\s+(.*?):/);
	$host_map->{$hostname}->{"lamnode"} = $node_number;
    }
    close(NODELIST);
}


# Create hostname => archtype hash

sub create_host_arch_hash {
    my $line;
    my $hostname;
    my $bogus;
    my $archtype;

    open (CMD, "lamexec N laminfo -arch -parsable -hostname |") ||
	mpiexec_die("Cannot run lamexec! \n");
    while ($line = <CMD>) {
	chomp($line);
	($hostname, $bogus, $archtype) = split(/:/, $line);
	$host_map->{$hostname}->{"arch"} = $archtype;
    }
    close(CMD);
    if ( $debug_mode ne "" )  {
	print "mpiexec: Here is hostname => (Node Number, Architecture) " .
	    "mapping\n";
	print Dumper($host_map);
    }
    $host_arch_hash_created = 1;
}


# Parse command line args

sub parse_arg {
    my (@my_ARGV) = @_; 
    my $i;
    my $arg;
    my $unrecognized_arg = 0;

    $i = 0;
    $arg = $my_ARGV[$i];

    while ($i <= $#my_ARGV) {
	if ( (! $unrecognized_arg ) && ($arg eq "-n") ) {
	    mpiexec_die("mpiexec arg -n cannot be used twice within same ",
			"context \n") 
		if ( defined($mpiexec_arg{"n"}) );
	    $mpiexec_arg{"n"} = $my_ARGV[++$i];
	} elsif ( (! $unrecognized_arg ) && ($arg eq "-host") ) {
	    mpiexec_die("mpiexec arg -host cannot be used twice within same ",
			"context \n") 
		if ( defined($mpiexec_arg{"host"}) );
	    $mpiexec_arg{"host"} = $my_ARGV[++$i];
	} elsif ( (! $unrecognized_arg ) && ($arg eq "-arch") ) {
	    mpiexec_die("mpiexec arg -arch cannot be used twice within same ",
			"context \n") 
		if ( defined($mpiexec_arg{"arch"}) );
	    $mpiexec_arg{"arch"} = $my_ARGV[++$i];
	} elsif ( (! $unrecognized_arg ) && ($arg eq "-wdir") ) {
	    mpiexec_die("mpiexec arg -wdir cannot be used twice within same ",
			"context \n") 
		if ( defined($mpiexec_arg{"wdir"}) );
	    $mpiexec_arg{"wdir"} = $my_ARGV[++$i];
	} elsif ( (! $unrecognized_arg ) && ($arg eq "-soft") ) {
	    $mpiexec_arg{"soft"} = $my_ARGV[++$i];
	} elsif ( (! $unrecognized_arg ) && ($arg eq "-path") ) {
	    $mpiexec_arg{"path"} = $my_ARGV[++$i];
	} elsif ( (! $unrecognized_arg ) && ($arg eq "-file") ) {
	    $mpiexec_arg{"file"} = $my_ARGV[++$i];
	} elsif ($arg eq ":") {
	    $arg = $my_ARGV[$i+1];
	    if (! defined($arg)) {
		mpiexec_die("Error in parsing line :\n",
			    join(" ", @my_ARGV), 
			    "\nYou cannot have : at end of line\n");
	    }

	    #write a line in appschema

	    app_schema_insert();

	    # Free up (empty) %mpiexec_arg and $mpirun_arg

	    undef %mpiexec_arg;
	    undef $mpirun_arg;
	    $mpirun_arg = "";
	    $unrecognized_arg = 0;
	} else {

	    # arg not recognized by mpiexec... so assume its mpirun arg

	    $unrecognized_arg = 1;
	    $mpirun_arg = $mpirun_arg . $arg . " ";
	}
	$arg = $my_ARGV[++$i];
    }
    
    # write a line in appschema

    app_schema_insert();

    # Free up (empty) %mpiexec_arg and $mpirun_arg

    undef %mpiexec_arg;
    undef $mpirun_arg;
    $mpirun_arg = "";
}


# Parse configfile

sub parse_file {
    my $temp_line;
    my @temp_argv;
    my $line_continued = 0;
    my $prev_line = "";

    # Open config file for reading

    open (CONF_FILE, "$_[0]") || 
	mpiexec_die("Cannot open config file $_[0] : $!\n");
    
    while ($temp_line = <CONF_FILE>) {

	# ignore commented lines in configfile

	if ($temp_line =~ /^\s*#/) {
	    next; 
	}

	# If previous line is being continued..

	if ($line_continued) {
	    $temp_line = $prev_line . $temp_line;
	    $prev_line = "";
	    $line_continued = 0;
	}

	# check if this line will be continued..

	if ($temp_line =~ /\\$/) {
	    $temp_line =~ s/\\$//;
	    $prev_line = $temp_line;
	    chomp($prev_line);
	    $line_continued = 1;
	    next;
	}
	chomp($temp_line);
	@temp_argv = split(/\s+/, $temp_line);
	parse_arg(@temp_argv);
    }
    
    close (CONF_FILE) || 
	mpiexec_die("Cannot close config file $_[0] : $!\n");

}


# Write a line in appschema file

sub app_schema_insert {
    my $found = 0;
    my $host;

    # if -host was specified we need to convert it to <where> option
    # for appschema

    if ($mpiexec_arg{"host"}) {

	if ($host_map->{$mpiexec_arg{"host"}}) {
	    print $as_fh $host_map->{$mpiexec_arg{"host"}}->{"lamnode"}
	    , " ";
	} else {
	    mpiexec_die "Invalid hostname " . $mpiexec_arg{"host"} .
		" specified\n";
	}
    }

    # if -arch was specified we need to convert it to <where> option
    # for appschema

    if ($mpiexec_arg{"arch"}) {
	if (! $host_arch_hash_created) {
	    create_host_arch_hash();
	}

	foreach $host ( keys( %{$host_map} ) ) {
	    if ($host_map->{$host}->{"arch"} =~ /$mpiexec_arg{"arch"}/) { 
		print $as_fh $host_map->{$host}->{"lamnode"} . " "; 
		$found = 1; 
	    } 
	} 
	
	if (! $found) { 
	    mpiexec_die("mpiexec didnt find any node with -arch = ",
			$mpiexec_arg{"arch"}, " \n");
	} 
    }

    # if -n was specified we need to convert it to -np option for appschema

    if ($mpiexec_arg{"n"}) {
	print $as_fh "-np " . $mpiexec_arg{"n"} . " ";
    }

    # if -wdir was specified we need to convert it to -wd option for
    # appschema

    if ($mpiexec_arg{"wdir"}) {
	print $as_fh "-wd " . $mpiexec_arg{"wdir"} . " ";
    }

    # if -file was specified we need to print warning that it was
    # ignored

    if ($mpiexec_arg{"file"}) {
	print "Warning!!! \"-file " . $mpiexec_arg{"file"} . 
	    "\" was ignored by mpiexec\n";
    }
    
    # stuff we don't handle yet
    
    if ( $mpiexec_arg{"soft"} || $mpiexec_arg{"path"} ) {
	mpiexec_die "mpiexec's -soft and -path arguments are not ",
	"currently supported \n";
    }
    
    # Finish the line

    print $as_fh $mpirun_arg . "\n";
    
}


# Our own function which takes care of Ignoring signals SIGINT,
# SIGTERM and SIGHUP during lamhalt. Also has a timeout mechanism
# where in if lamhalt does not complete within specified number of
# seconds it interrupts lamhalt and runs lamwipe to cleanup LAM RTE

sub my_lamhalt {

    # ignore signals INT, TERM and HUP during lamhalt
	
    $SIG{INT} = 'IGNORE';
    $SIG{TERM} = 'IGNORE';
    $SIG{HUP} = 'IGNORE';
    
    $SIG{ALRM} = \&sig_handler_ALRM;

    # We have to use eval here because on timeout sig_handler_ALRM
    # gets invoked and calls die. eval forces output of die to go into
    # $@ instead of exiting the program with that error message.. kind
    # of try catch

    eval {
  	alarm ($lamhalt_timeout);
	system($prefix_bin."lamhalt $boot_flag $verbose_mode $debug_mode") &&
	    die("Cannot lamhalt !!!! $! \n");
	alarm (0); # clear the still pending alarm
    };
    
    if ($@) {
	
	if ($@ =~ /timeout/) {
	    
	    # lamboot timed out.. so use lamwipe 
	    
	    if ( ( $verbose_mode ne "" ) || ( $debug_mode ne "" ) ) {
		print "mpiexec: Lamhalt operation timedout .. trying lamwipe\n";
	    }
	    system("lamwipe $verbose_mode $debug_mode $machinefile") &&
		die("Cannot lamwipe !!!! $! \n");
	    if ( ( $verbose_mode ne "" ) || ( $debug_mode ne "" ) ) {
		print "mpiexec: lamwipe completed\n";
	    }
	} 
	else {

	    # something else caused alarm
	    # propagate unexpected errors

	    alarm(0); # clear the still pending alarm
	    die "\n";

	}
    }	
    else { 

        # lamboot didn't time out
 	
	if ( ( $verbose_mode ne "" ) || ( $debug_mode ne "" ) ) {
	    print "mpiexec: Lamhalt complete\n";
	}
    }
    $booted_lam = 0;
    
    # reinstall signal handlers for signals INT, TERM and HUP
    
    $SIG{INT} = \&sig_handler_INT;
    $SIG{TERM} = \&sig_handler_TERM;
    $SIG{HUP} = \&sig_handler_HUP;

}


# Subroutine for fork-exec-waitpid (few)

sub few {

    if ($pid = fork) {
	waitpid($pid, 0);
	
	# check exit status of child process
	
	$child_waitpid_status = $?;
	handle_waitpid_status( $child_waitpid_status, $current_state );
	
    } else {
	die "Cannot fork: $!\n" 
	    unless defined $pid;
	
	# exec lamboot

	if ( $current_state eq "lamboot" ) {
	    if ( $prefix_boot == 1) {
		$prefix_arg = "-prefix $prefix_path";
	    } 
            if ( $verbose_mode ne "" ) {
                print "Running: " . "lamboot $boot_flag $verbose_mode $debug_mode $bootargs $ssi_args $prefix_arg $machinefile";
            }
	    exec("lamboot $boot_flag $verbose_mode $debug_mode $bootargs $prefix_arg $machinefile") 
		|| die "mpiexec cannot exec lamboot $machinefile: $!\n"; 
	}

	# exec mpirun

	if ( $current_state eq "mpirun" ) {
            if ( $verbose_mode ne "" ) {
                print "Running: " . $prefix_bin."mpirun $ssi_args $verbose_mode $tv_mode $as_fn";
            }
	    exec($prefix_bin."mpirun $ssi_args $verbose_mode $tv_mode $as_fn") ||
		mpiexec_die("mpiexec cannot exec mpirun: $!\n"); 
	}
    }
}


# Help message

sub show_help {
    print "-----------------------------------------------------------------------------
Synopsis:       mpiexec [global_args] local_args1 [: local_args2 [...]]
                mpiexec [global_args] -configfile <filename>

Description:    Run MPI programs on LAM nodes.

Global arguments:
   -h                  This message
   -boot               Boot the LAM universe before running
   -boot-args <args>   Pass <args> to the booting agent
   -d                  Lots of debugging output
   -machinefile <file> Use boot schema <filename> to boot the LAM universe
   -prefix <lam/install/path>
                       Use the LAM installation specified in </lam/install/path/>
   -ssi <key> <value>  Set SSI parameter <key> to value <value>
   -tv                 Launch MPI process under the TotalView debugger
   -v                  Be verbose

Local arguments:
   -n <numprocs>       Specify how many processes to start
   -host <hostname>    Launch on a specific hostname
   -arch <arch>        Launch on a specific architecture type
   -wdir <dir>         Set working directory
   <other_arguments>   Passed back to the MPI executable
-----------------------------------------------------------------------------\n";
}
