Новосибирский институт органической химии им. Н.Н. Ворожцова СО РАН

Лаборатория изучения механизмов органических реакций

EEEvsEEE


#!/usr/bin/perl -ws

our ($h,$help);
(my $program = $0) =~ s/^.*[\/\\]//;
if ($h || $help) {
  print "Usage: $program *.DFT.xyz vs. *.PM7.xyz and *.xTB.xyz

  Convention about xyz-files:
Each xyz-file must contain energy in the 2-nd line
Energy in the first substring matching float number (e.g. Energy -233.46145057).
Energy units is hartree by default.

'vs.', 'and' are reserved words. There should be no files with the these names.

File.txt may be instead of xyz-files. Txt-file must have '.txt' extension and 
contain one energy per line, e.g. 
grep 'HEAT OF FORMATION' *.PM7.arc > PM7.txt
$program *.DFT.xyz vs. PM7.txt

Options:
-u=units  'h' is hartree, 'k' is kcal
          For example, for arguments *.DFT.xyz vs. *.PM7.xyz and *.xTB.xyz
          -u=hkh (hartree for DFT and xTB, kcal for PM7)
-Edisp    Grimme's D3 correction (if 2nd line of xyz file contains 
          substring 'Edisp -9.79' (kcal/mol))
-min      Default behavior is to print energies with 1st point to be zero level.
          This option forces minimal energy to be zero level.
-no_print_E  Copmact output. Only statistics.
";
  exit;
}

our ($u,$Edisp,$min,$no_print_E);
#use Data::Dump;

my @fstr = map {[map {[split]} @$_]} map {[split ' and ']} split ' vs. ', "@ARGV"; 
# It's terrible!
# qw(1 2 and 3 4 vs. 5 6 and 7 8) => ([[1 2],[3 4]], [[5 6],[7 8]])
die "No or too many 'vs.'\n" if @fstr != 2;

my @units = split '', $u if $u;

my $num = qr/-?\d+(?:\.\d+)?/;


foreach my $vs (@fstr) {
  foreach my $and (@$vs) {
    #print "@$and\n";
    my $end = delete_eq_end($and);
    $end =~ s/\.xyz$//;
    $end =~ s/^\.//;
    $end = ' ' if $end eq '';
    my $unit = 'h';
    if ($u) {
      $unit = shift @units;
      die "-u=$u does not coorespond to arguments (less)\n" unless $unit;
    }
    $and = [$and, $unit,$end];
  }
}
die "-u=$u does not coorespond to arguments (more)\n" if $u && @units;

my (@en_dft,@en_pm,@name_dft,@name_pm);
foreach my $dft (@{$fstr[0]}) {
  my @en;
  if (@{$dft->[0]} == 1 && $dft->[0][0] =~ /\.txt$/) {
    my ($name,$en,$end) = en_txt($dft->[0][0],$dft->[1]);
#    $dft->[0] = $name;
    @en = @$en;
    $end =~ s/:$//;
    $end =~ s/(\.arc|\.out)$//;
    $end =~ s/^\.//;
    ($end = $dft->[0][0]) =~ s/\.txt// if $end eq '';
    $end = ' ' if $end eq '';
    $dft->[2] = $end;
  }
  else {
    my @mols_dft = read_molden(@{$dft->[0]});
    @en = energy(@mols_dft, $dft->[1]);
  }
  push @en_dft, \@en;
  push @name_dft, $dft->[2];
}
foreach my $pm (@{$fstr[1]}) {
  my @en;
  if (@{$pm->[0]} == 1 && $pm->[0][0] =~ /\.txt$/) {
    my ($name,$en,$end) = en_txt($pm->[0][0],$pm->[1]);
    #dd $name,$en,$end;
#    $pm->[0] = $name;
    @en = @$en;
    $end =~ s/:$//;
    $end =~ s/(\.arc|\.out)$//;
    $end =~ s/^\.//;
    ($end = $pm->[0][0]) =~ s/\.txt// if $end eq '';
    $end = ' ' if $end eq '';
    $pm->[2] = $end;
  }
  else {
    my @mols_pm = read_molden(@{$pm->[0]});
    @en = energy(@mols_pm, $pm->[1]);
  }
  push @en_pm, \@en;
  push @name_pm, $pm->[2];
}
#dd @name_pm; exit;

my $FFF = 15;
my $J = @{$en_dft[0]};
for (my $dft=0; $dft<@en_dft; $dft++) {
  die "Nonconsistent data $name_dft[$dft] and $name_dft[0]\n" if @{$en_dft[$dft]} != $J;
  for (my $pm=0; $pm<@en_pm; $pm++) {
    die "Nonconsistent data $name_pm[$pm] and $name_dft[0]\n" if @{$en_pm[$pm]} != $J;
  }
  if (! $no_print_E) {
    print map({sprintf " %${FFF}s",$_} $name_dft[$dft], @name_pm), "\n";
    for (my $j=0; $j<$J; $j++) {
      printf " %${FFF}.2f", $en_dft[$dft][$j];
      for (my $pm=0; $pm<@en_pm; $pm++) {
        printf " %${FFF}.2f", $en_pm[$pm][$j];
      }
      print "\n";
    }
    print "-"x((2+@en_pm)*(${FFF}+1)), "\n";
  }
  # print names
  print map({sprintf " %${FFF}s",$_} $name_dft[$dft], @name_pm, "$J conformers"), "\n";
  # print delta_E
  print map {sprintf " %${FFF}.2f", $_} 
  max(@{$en_dft[$dft]})-min(@{$en_dft[$dft]}),
  map {max(@$_)-min(@$_)} @en_pm;
  printf " %${FFF}s\n", "delta_E";
  # print Pearson and calculate MAE
  printf " %${FFF}s", "Pearson";
  my (@MAE);
  my @en1 = @{$en_dft[$dft]};
  for (my $i=0; $i<@en_pm; $i++) {
    my @en2 = @{$en_pm[$i]};
    my $R = correlation(\@en1, \@en2);
    my $N = 2;
    do{$N++ while 1-$R <= 10**(-$N)} if $R < 1;
    printf " %${FFF}.${N}f", $R;
    ($MAE[0][$i],$MAE[1][$i]) = MAE(\@en1, \@en2);
  }
  print "\n";
  # print Spearman
  printf " %${FFF}s", "Spearman";
  for (my $i=0; $i<@en_pm; $i++) {
    my @en2 = @{$en_pm[$i]};
    my $R = spearman(\@en1, \@en2);
    my $N = 2;
    do{$N++ while 1-$R <= 10**(-$N)} if $R < 1;
    printf " %${FFF}.${N}f", $R;
  }
  print "\n";
  # print MAE_min
  printf " %${FFF}s", "MAE_min"; 
  print map( {sprintf " %${FFF}.2f", $_} @{$MAE[0]}), "\n";
  # print MAE
  printf " %${FFF}s", "MAE"; 
  print map( {sprintf " %${FFF}.2f", $_} @{$MAE[1]}), "\n";
  print "\n" if $dft < $#en_dft;  
}
exit;


sub en_txt {
  #dd @_;
  my ($file,$unit) = @_;
  my $mult = 627.51 if $unit eq 'h';
  $mult = 1 if $unit eq 'k';
  
  my (@name,@E,$end);
  open L, '<', $file or die "Can't open $file: $!\n";
  while (<L>) {
    my ($name) = m/^\s*(\S+)/;
    my ($en) = /(?:^|\s|=)($num)(?:\s|$)/;
    $en *= $mult;
    if ($Edisp) {
      my ($Edisp) = m/\sEdisp\s+($num)/;
      $en += $Edisp if $Edisp;
    }
    push @E, $en;
    push @name, $name;
  }
  close L;
  #dd @name;
  $end = delete_eq_end(\@name);
  @E = map {$_-$E[0]} @E;
  if ($min) {
    my $m = min(@E);
    @E = map {$_-$m} @E;
  }
  return \@name,\@E,$end;
}

sub energy {
  my $unit = pop @_;
  my $mult = 627.51 if $unit eq 'h';
  $mult = 1 if $unit eq 'k';
  
  my @E;
  my @mols = @_;
  foreach my $mol (@mols) {
    my $E = $mol->[0]{Energy}*$mult;
    if ($Edisp) {
      #die "No Edisp in $mol->[0]{Name}\n" unless exists $mol->[0]{Edisp};
      $E += $mol->[0]{Edisp} if exists $mol->[0]{Edisp};
    }
    push @E, $E;
  }
  @E = map {$_-$E[0]} @E;
  if ($min) {
    my $m = min(@E);
    @E = map {$_-$m} @E;
  }
  return @E;
}

sub read_molden {
 ############################################################################
 ## Молекула -- ссылка на массив, 0-й элемент -- свойства, следующие - атомы.
 ## Свойства -- ссылка на хэш с ключами Energy, Symmetry
 ## Атом -- ссылка на массив [atom, x, y, z, ppm] (ppm может не быть).
 ##
 ## Читает xyz. Параметры - имена xyz-файлов. Если параметров нет, то <>.
 ## Возвращает массив найденных молекул.
 ############################################################################
  local @ARGV = @_ ? @_ : @ARGV;
  my @mols;
  my $line;
  LOOP:
  while ($line || defined($line = <>)) {
    #print $line;
    if ($line =~/^\s*(\d+)\s*$/) {
      my @mol;
      my $N = $1;
      last LOOP if eof();
      next LOOP if eof(ARGV);
      $line = <>;
      ($mol[0]{'Energy'}) = $line =~ /(?:^|\s|=)($num)(?:\s|$)/o;
      ($mol[0]{'Symmetry'}) = $line =~ /symm\S*\s+(\S+)/i;
      $line=~/\sG\(($num)\)\s+($num)/ && ($mol[0]{G} = $2);
      $line=~/\sZPE\s+($num)/ && ($mol[0]{ZPE} = $1);
      $line=~/\sEdisp\s+($num)/ && ($mol[0]{Edisp} = $1);
      for (my $i=1; $i<=$N; $i++) {
        last LOOP if eof();
        next LOOP if eof(ARGV);
        $line = <>;
        #print $line;
        if ($line =~ /^\s*([A-Z]{1,2})\s+($num)\s+($num)\s+($num)\s*(.*)/io) {
          $mol[$i] = [$1,$2,$3,$4,$5];
        } else {
          next LOOP;
        }
      }
      push @mols, \@mol;
      last LOOP if eof();
    } else {
      undef $line;
    }
  }
  return @mols;
}

sub min {
  my $min = 1e10;
  foreach my $val (@_) {
    $min = $val if $min > $val;
  }
  return $min;
}
sub max {
  my $max = -1e10;
  foreach my $val (@_) {
    $max = $val if $max < $val;
  }
  return $max;
}
sub sum {
  my $sum;
  foreach my $val (@_) {
    $sum += $val;
  }
  return $sum;
}
sub mean {
  return undef unless @_;
  return sum(@_)/@_;
}
sub stddev {
  return undef unless @_;
  my $mean = mean(@_);
  my $sum = 0;
  foreach my $val (@_) {
    $sum += ($val-$mean)**2;
  }
  return sqrt($sum/$#_);
}
sub covariance {
  my @X = @{$_[0]};
  my @Y = @{$_[1]};
  return undef unless @X;
  return undef if @X != @Y;
  my $meanX = mean(@X);
  my $meanY = mean(@Y);
  my $sum = 0;
  for (my $i=0; $i<@X; $i++) {
    $sum += ($X[$i]-$meanX)*($Y[$i]-$meanY);
  }
  return $sum/$#X;
}
sub correlation {
  my @X = @{$_[0]};
  my @Y = @{$_[1]};
  return undef unless @X;
  return undef if @X != @Y;
  return covariance(@_)/(stddev(@X)*stddev(@Y));
}
sub median {
  return undef unless @_;
  return $_[0] if @_==1;
  my @S = sort {$a<=>$b} @_;
  my $i1 = int($#S/2);
  my $i2 = $i1 + $#S%2;
  return ($S[$i1]+$S[$i2])/2;
}
sub MAE {
  my @X = @{$_[0]};
  my @Y = @{$_[1]};
  return undef unless @X;
  return undef if @X != @Y;
  my $medianXY = median(map {$X[$_]-$Y[$_]} (0..$#X));
  my ($sum,$sum0);
  for (my $i=0; $i<@X; $i++) {
    $sum += abs($X[$i]-$Y[$i]-$medianXY);
    $sum0 += abs($X[$i]-$Y[$i]);
  }
  return $sum/@X, $sum0/@X;
}
sub rank {
  my $i = 1;
  my $j = 1;
  my @rank = 
    map {$_->[0]} 
    sort {$a->[1]<=>$b->[1]} 
    map {[$j++,$_->[1]]} 
    sort {$a->[0]<=>$b->[0]} 
    map {[$_,$i++]} 
  @_;
  return @rank;
}
sub spearman {
  my @X = @{$_[0]};
  my @Y = @{$_[1]};
  return undef if @X != @Y;
  my $n = @X;
  my @rankX = rank(@X);
  my @rankY = rank(@Y);
  my $rs2 = 0;
  for (my $i=0; $i<@X; $i++) {
    $rs2 += ($rankX[$i]-$rankY[$i])**2;
  }
  return 1-6*$rs2/$n/($n**2-1);
}
sub delete_eq_end {
  my @ar = @{$_[0]};
  return '' if @ar == 1;
  my $end = '';
  while (1) {
    my @a = @ar;
    $a0 = substr $a[0], -1, 1, '';
    for (my $i=1; $i<@a; $i++) {
      my $ai = substr $a[$i], -1, 1, '';
      #print "$ai ne $a0\n";
      return '' if $ai eq '' && $a0 eq '';
      return reverse($end) if $ai ne $a0;
    }
    $end .= $a0;
    @ar = @a;
  }
}
sub delete_eq_begin {
  my @ar = @{$_[0]};
  return '' if @ar == 1;
  my $begin = '';
  while (1) {
    my @a = @ar;
    $a0 = substr $a[0], 0, 1, '';
    for (my $i=1; $i<@a; $i++) {
      my $ai = substr $a[$i], 0, 1, '';
      return '' if $ai eq '' && $a0 eq '';
      return $begin if $ai ne $a0;
    }
    $begin .= $a0;
    @ar = @a;
  }
}