#!/usr/bin/perl -w
#use Statistics::Lite qw(all);
# A. Wood, August 2006
# E. Rogers Jan 2007, commented out line to convert cfs to af for observed data
#                     incoming observed data in KAF, edited to convert sim flow to KAF
# *******************************************************************************/
#  This script is based on the sflow_statistics.pl script that calculates a range
#  of statistics related to simulated and observed flow.  In this case, though, it
#  outputs only those used by the MOCOM-UA procedure to optimize a run.
#
#  Inputs:
#   <sim file> <obs file> <results file> <avg startdate> <avg enddate> <void>
#   Input flow files should contain 3 columns. <year><mon><flow in CFS>
#   start and end dates specify period used for analysis, and should contain
#   data for both records
#
#  Outputs:
#   program writes out a stats file, but prints several stats to STDOUT that are
#   captured in MOCOM's "R2FILE" for use by the optimizer.
#
#   Stats calculated:
#   Pearson's r =
#      (sum(xi*yi) - sum(xi)*sum(yi)/n)/(sqrt(sum(xi^2)- (sum(xi))^2/n) *
#          sqrt(sum(yi^2)-(sum(yi))^2/n)
#   mean & std. dev for simulation and observation
#   Root mean square error (RMSE)
#   Model efficiency (Nash and Sutcliff 1970 ?)
#        both for raw & log flows (used by optimizer)
#   Various monthly mean statistics
#   Number of Sign Changes (NSC) in Sim-Obs residual
#   Absolute peak difference from mean monthly hydrographs
#
#   Stats used by optimizer:
#   NSC, RMSE, peak difference (all monthly)
#
#  Author:    A.Wood, August 2006
# *******************************************************************************/

# -------- process arguments --------------
if(@ARGV != 7) {
  die "USAGE: <sim file> <obs file> <results file> <avg startdate> <avg enddate> <void> <statsfile>\n  Flow file fmt: <yr><mon><flow>; Date fmt: YYYYMM\n";
}
open(SIM, "<$ARGV[0]") or die "Can't open $ARGV[0]: $!\n";
open(OBS, "<$ARGV[1]") or die "Can't open $ARGV[1]: $!\n";
open(STATS, ">$ARGV[2]") or die "Can't open $ARGV[2]: $!\n";
($AVG_SYR, $AVG_SMO) = (substr($ARGV[3],0,4), substr($ARGV[3],4,2));
($AVG_EYR, $AVG_EMO) = (substr($ARGV[4],0,4), substr($ARGV[4],4,2));
$void = $ARGV[5];
my $statsfilename = $ARGV[6];

# ------ initialize
@DaysInMon = (-99, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31); #no leapyr
$sum_obs = $sum_sim = $sum_obssim = $sum_obs_sq = $sum_sim_sq = 0;
$sum_ln_obs = $sum_ln_sim = 0;  # for nat log of flows
$bias_ss = $modelef1 = $modelef2 = 0;
$bias_ss_ln = $modelef1_ln = $modelef2_ln = 0;
$obs_annavg = $sim_annavg = $daycnt = 0;
@obs = @sim = @obs_ym = @sim_ym = @mon = @yr = ();
@bias_ss_mon = @simresid_ss_mon = @obsresid_ss_mon = ();
for($m=1;$m<13;$m++) {
  $mcnt[$m] = $obs_ym_avg[$m] = $sim_ym_avg[$m] = 0;
}

print STDOUT "$AVG_SYR, $AVG_SMO, $AVG_EYR, $AVG_EMO\n";

# ------ read obs datafile -----
$n = 0;
while(<OBS>) {
    chomp;
  ($y, $m, $q) = split /\s+/;
  if( ($y>$AVG_SYR || ($y==$AVG_SYR && $m>=$AVG_SMO)) &&
      ($y<$AVG_EYR || ($y==$AVG_EYR && $m<=$AVG_EMO)) ){
    if($q!=$void) {

# observed files in acre feet (KAF) 
#($yr[$n],$mon[$n],$obs[$n]) = ($y,$m,$q); # kaf      
($yr[$n],$mon[$n],$obs[$n]) = ($y,$m,($q*1.9835*$DaysInMon[$m])/1000); # cfs->kaf

      $ln_obs[$n] = log($obs[$n] + 1);  # also take natural log of flows
    } else {
      ($yr[$n],$mon[$n],$obs[$n]) = ($y,$m,$q);  # don't convert voids
      $ln_obs[$n] = $q;
    }
    $n++;
  }
}
close(OBS);
$nobs = $n;


# ------ read sim datafile -----
$n = 0;
while(<SIM>) {
  ($y, $m, $q) = split;
  if( ($y>$AVG_SYR || ($y==$AVG_SYR && $m>=$AVG_SMO)) &&
      ($y<$AVG_EYR || ($y==$AVG_EYR && $m<=$AVG_EMO)) ) {
    if($q!=$void) {
      $sim[$n] = ($q*1.9835*$DaysInMon[$m])/1000; # cfs->kaf
      $ln_sim[$n] = log($sim[$n] + 1); # also take natural log of flows
    } else {
      $sim[$n] = $q;
      $ln_sim[$n] = $q;
    }
    $n++;
  }
}
close(SIM);

if($nobs != $n) {
  die "input flow records insufficient for averaging period. check flows\n";
}

# ----- now cycle through records and calculate stats --------

$nvalid=0;
for($n=0;$n<$nobs;$n++) {
  if($obs[$n]!=$void && $sim[$n]!=$void) {
    $sum_obs += $obs[$n];
    $sum_sim += $sim[$n];
    $sum_ln_obs += $ln_obs[$n];  # log values
    $sum_ln_sim += $ln_sim[$n];  # log values
    $obs_annavg += $obs[$n];  # for ann tots only
    $sim_annavg += $sim[$n];  #
    $daycnt += $DaysInMon[$mon[$n]];               #
    $obs_ym[$y-$yr[0]][$mon[$n]] = $obs[$n];  # arrays indexed by y, m
    $sim_ym[$y-$yr[0]][$mon[$n]] = $sim[$n];  # not really used yet
    $obs_ym_avg[$mon[$n]] += $obs[$n];     # monthly averaging variables
    $sim_ym_avg[$mon[$n]] += $sim[$n];
    $mcnt[$mon[$n]]++;
    $nvalid++;
  }
}

# ------ calculate means, peak differences ---------------------------
$obs_peak = $sim_peak = 0;
$mean_obs = $sum_obs / $nvalid;
$mean_sim = $sum_sim / $nvalid;
$mean_ln_obs = $sum_ln_obs / $nvalid;  # nat log
$mean_ln_sim = $sum_ln_sim / $nvalid;  #
$obs_annavg /= $daycnt;
$sim_annavg /= $daycnt;
for($m=1;$m<13;$m++) {
  $obs_ym_avg[$m] /= $mcnt[$m];
  $sim_ym_avg[$m] /= $mcnt[$m];
  # find peaks
  if($obs_ym_avg[$m] > $obs_peak) {
    $obs_peak = $obs_ym_avg[$m];
  }
  if($sim_ym_avg[$m] > $sim_peak) {
    $sim_peak = $sim_ym_avg[$m];
  }
}

# ------ calculate std. dev, correlations, etc. ------------
for($n=0;$n<$nobs;$n++) {
  if($obs[$n]!=$void && $sim[$n]!=$void) {
    $sum_obssim += $obs[$n] * $sim[$n];
    $sum_obs_sq += $obs[$n]**2;
    $sum_sim_sq += $sim[$n]**2;
    # for NS efficiency
    $bias_ss  += ($obs[$n] - $sim[$n])**2;
    $modelef1 += ($sim[$n] - $mean_sim)**2;
    $modelef2 += ($obs[$n] - $mean_obs)**2;
    # for NS efficiency of nat log
    $bias_ss_ln  += ($ln_obs[$n] - $ln_sim[$n])**2;
    $modelef1_ln += ($ln_sim[$n] - $mean_ln_sim)**2;
    $modelef2_ln += ($ln_obs[$n] - $mean_ln_obs)**2;
    # monthly sums of squares
    $bias_ss_mon[$mon[$n]] += ($obs[$n] - $sim[$n])**2;
    $simresid_ss_mon[$mon[$n]] += ($sim[$n] - $mean_sim)**2;
    $obsresid_ss_mon[$mon[$n]] += ($obs[$n] - $mean_obs)**2;
  }
}
$sd_sim = sqrt($modelef1/$n);
$sd_obs = sqrt($modelef2/$n);

$r1 =     ($sum_obssim - ($sum_obs * $sum_sim)/$n);
$r2 = sqrt($sum_obs_sq - ($sum_obs**2)/$n);
$r3 = sqrt($sum_sim_sq - ($sum_sim**2)/$n);
$r  = $r1/($r2*$r3);
$rmse    = sqrt($bias_ss/$n);
#$modelef    = 1 - $bias_ss/$modelef1;  # NS Efficiency (ERROR in denom?)
$modelef    = 1 - $bias_ss/$modelef2;  # NS Efficiency
$modelef_ln = 1 - $bias_ss_ln/$modelef2_ln;  # NS Efficiency - nat log flows
for($m=1;$m<13;$m++) {
  $rmse_mon[$m]   = sqrt($bias_ss_mon[$m]/$mcnt[$m]);
  $sd_obs_mon[$m] = sqrt($obsresid_ss_mon[$m]/$mcnt[$m]);
  $sd_sim_mon[$m] = sqrt($simresid_ss_mon[$m]/$mcnt[$m]);
}

# ------ calculate number of sign changes ------------
# analyze no-void sections of timeseries
if($obs[0]!=$void && $sim[0]!=$void && (($obs[0]-$sim[0]) != 0)) {
  $sign = $curr_sign = ($obs[0]-$sim[0])/abs(($obs[0]-$sim[0]));
} else {
  $sign = $curr_sign = 1;
}
$NSC=0;
for($n=1;$n<$nobs;$n++) {
  if($obs[$n]!=$void && $sim[$n]!=$void) {
    if($obs[$n]!=$sim[$n] && (($obs[$n]-$sim[$n]) != 0)) {
      $curr_sign = ($obs[$n]-$sim[$n])/abs(($obs[$n]-$sim[$n]));
    }
    if($curr_sign != $sign) {
      $NSC++;
    }
    $sign = $curr_sign;
  }
}

# %%%%%%%%%%%%%%% print output for optimizer %%%%%%%%%%%%%%%%%%%
# optimizer expects first value to be R^2 *-1 (probably either
# correlation coefficient or Nash Sutcliff efficiency will work (a 1.0
# is perfect for both) and will read additional stats to be minimized.
# How many are used in the optimization depends on setting "num_tests" in
# main script, run_cal.scr, which becomes N_TEST_FUNCS in the MOCOM program.

# AWW-aug06:  It's not clear that the first item output, "R-sqr",
#             is used in the optimization
#
# outputs for optim:  r^2, NSeff, ann_avg_vol_err,
#                     NSeff(natlog q),abs(peak err),
#                     RMSE, #_sign_chgs

open(STATSFILE, ">$statsfilename") or die "Can't open $statsfilename for writing: $!\n";

printf STATSFILE "%.3f %.3f %.3f %.3f %.3f %.3f\n",
    -1.0*$modelef,
    -1.0*$modelef_ln,
    abs(($sim_annavg / $obs_annavg) - 1),
    -1.0*($r*$r),
#    abs($sim_peak-$obs_peak),
#    abs($rmse);
    abs(($sim_peak/$obs_peak)-1),
    abs($rmse/$mean_obs);
#   $NSC;

# %%%%%%%%%%%%%%% write output %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
printf STATS "Calculated stats for period:  %d %d to %d %d\n\n",
  $yr[0],$mon[0],$yr[$n-1],$mon[$n-1];
printf STATS "                         Obs             Sim   Sim/Obs\n";
printf STATS "Avg Flow (KAF)   %12.1f    %12.1f      %3.2f\n",
 $obs_annavg, $sim_annavg, $sim_annavg/$obs_annavg;
printf STATS "Std Dev (KAF)    %12.1f    %12.1f      %3.2f\n\n",
 $sd_obs, $sd_sim, $sd_sim/$sd_obs;
printf STATS "Correlation Coefficient = %.2f\n", $r;
printf STATS "Nash-Sutcliff Effc      = %.2f\n", $modelef;
printf STATS "N.S. Effc natlog flows  = %.2f\n", $modelef_ln;
printf STATS "RMSE/Obs Mean           = %.2f\n", ($rmse/$mean_obs);
printf STATS "MSE/Obs Var             = %.2f\n\n", ($rmse**2/$sd_obs**2);
printf STATS "RMSE (KAF)               = %.1f\n", $rmse;
printf STATS "Abs Avg Peak Diff (KAF)  = %.1f\n", abs($sim_peak-$obs_peak);
printf STATS "Number of Sign Changes  = %d\n\n", $NSC;

printf STATS "Monthly Stats:\n\n";
printf STATS "Mon\tObsAvg\t\tSimAvg\t\tBias\t\tRMSE\t\tObsStDev\tSimStDev\n";
for($m=1;$m<13;$m++) {
  printf STATS "%2d %14.1f %14.1f %14.1f %14.1f %14.1f %14.1f\n",
    $m, $obs_ym_avg[$m], $sim_ym_avg[$m], $sim_ym_avg[$m]-$obs_ym_avg[$m],
    $rmse_mon[$m], $sd_obs_mon[$m], $sd_sim_mon[$m];
}

