Новосибирский институт органической химии им. Н.Н. Ворожцова СО РАН Лаборатория изучения механизмов органических реакций |
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||
EEEvsEEE#!/usr/bin/perl -ws our ($h,$help); (my $program = $0) =~ s/^.*[\/\\]//; if ($h || $help) { print "Usage: $program *.DFT.xyz vs. *.PM7.xyz and *.xTB.xyz Convention about xyz-files: Each xyz-file must contain energy in the 2-nd line Energy in the first substring matching float number (e.g. Energy -233.46145057). Energy units is hartree by default. 'vs.', 'and' are reserved words. There should be no files with the these names. File.txt may be instead of xyz-files. Txt-file must have '.txt' extension and contain one energy per line, e.g. grep 'HEAT OF FORMATION' *.PM7.arc > PM7.txt $program *.DFT.xyz vs. PM7.txt Options: -u=units 'h' is hartree, 'k' is kcal For example, for arguments *.DFT.xyz vs. *.PM7.xyz and *.xTB.xyz -u=hkh (hartree for DFT and xTB, kcal for PM7) -Edisp Grimme's D3 correction (if 2nd line of xyz file contains substring 'Edisp -9.79' (kcal/mol)) -min Default behavior is to print energies with 1st point to be zero level. This option forces minimal energy to be zero level. -no_print_E Copmact output. Only statistics. "; exit; } our ($u,$Edisp,$min,$no_print_E); #use Data::Dump; my @fstr = map {[map {[split]} @$_]} map {[split ' and ']} split ' vs. ', "@ARGV"; # It's terrible! # qw(1 2 and 3 4 vs. 5 6 and 7 8) => ([[1 2],[3 4]], [[5 6],[7 8]]) die "No or too many 'vs.'\n" if @fstr != 2; my @units = split '', $u if $u; my $num = qr/-?\d+(?:\.\d+)?/; foreach my $vs (@fstr) { foreach my $and (@$vs) { #print "@$and\n"; my $end = delete_eq_end($and); $end =~ s/\.xyz$//; $end =~ s/^\.//; $end = ' ' if $end eq ''; my $unit = 'h'; if ($u) { $unit = shift @units; die "-u=$u does not coorespond to arguments (less)\n" unless $unit; } $and = [$and, $unit,$end]; } } die "-u=$u does not coorespond to arguments (more)\n" if $u && @units; my (@en_dft,@en_pm,@name_dft,@name_pm); foreach my $dft (@{$fstr[0]}) { my @en; if (@{$dft->[0]} == 1 && $dft->[0][0] =~ /\.txt$/) { my ($name,$en,$end) = en_txt($dft->[0][0],$dft->[1]); # $dft->[0] = $name; @en = @$en; $end =~ s/:$//; $end =~ s/(\.arc|\.out)$//; $end =~ s/^\.//; ($end = $dft->[0][0]) =~ s/\.txt// if $end eq ''; $end = ' ' if $end eq ''; $dft->[2] = $end; } else { my @mols_dft = read_molden(@{$dft->[0]}); @en = energy(@mols_dft, $dft->[1]); } push @en_dft, \@en; push @name_dft, $dft->[2]; } foreach my $pm (@{$fstr[1]}) { my @en; if (@{$pm->[0]} == 1 && $pm->[0][0] =~ /\.txt$/) { my ($name,$en,$end) = en_txt($pm->[0][0],$pm->[1]); #dd $name,$en,$end; # $pm->[0] = $name; @en = @$en; $end =~ s/:$//; $end =~ s/(\.arc|\.out)$//; $end =~ s/^\.//; ($end = $pm->[0][0]) =~ s/\.txt// if $end eq ''; $end = ' ' if $end eq ''; $pm->[2] = $end; } else { my @mols_pm = read_molden(@{$pm->[0]}); @en = energy(@mols_pm, $pm->[1]); } push @en_pm, \@en; push @name_pm, $pm->[2]; } #dd @name_pm; exit; my $FFF = 15; my $J = @{$en_dft[0]}; for (my $dft=0; $dft<@en_dft; $dft++) { die "Nonconsistent data $name_dft[$dft] and $name_dft[0]\n" if @{$en_dft[$dft]} != $J; for (my $pm=0; $pm<@en_pm; $pm++) { die "Nonconsistent data $name_pm[$pm] and $name_dft[0]\n" if @{$en_pm[$pm]} != $J; } if (! $no_print_E) { print map({sprintf " %${FFF}s",$_} $name_dft[$dft], @name_pm), "\n"; for (my $j=0; $j<$J; $j++) { printf " %${FFF}.2f", $en_dft[$dft][$j]; for (my $pm=0; $pm<@en_pm; $pm++) { printf " %${FFF}.2f", $en_pm[$pm][$j]; } print "\n"; } print "-"x((2+@en_pm)*(${FFF}+1)), "\n"; } # print names print map({sprintf " %${FFF}s",$_} $name_dft[$dft], @name_pm, "$J conformers"), "\n"; # print delta_E print map {sprintf " %${FFF}.2f", $_} max(@{$en_dft[$dft]})-min(@{$en_dft[$dft]}), map {max(@$_)-min(@$_)} @en_pm; printf " %${FFF}s\n", "delta_E"; # print Pearson and calculate MAE printf " %${FFF}s", "Pearson"; my (@MAE); my @en1 = @{$en_dft[$dft]}; for (my $i=0; $i<@en_pm; $i++) { my @en2 = @{$en_pm[$i]}; my $R = correlation(\@en1, \@en2); my $N = 2; do{$N++ while 1-$R <= 10**(-$N)} if $R < 1; printf " %${FFF}.${N}f", $R; ($MAE[0][$i],$MAE[1][$i]) = MAE(\@en1, \@en2); } print "\n"; # print Spearman printf " %${FFF}s", "Spearman"; for (my $i=0; $i<@en_pm; $i++) { my @en2 = @{$en_pm[$i]}; my $R = spearman(\@en1, \@en2); my $N = 2; do{$N++ while 1-$R <= 10**(-$N)} if $R < 1; printf " %${FFF}.${N}f", $R; } print "\n"; # print MAE_min printf " %${FFF}s", "MAE_min"; print map( {sprintf " %${FFF}.2f", $_} @{$MAE[0]}), "\n"; # print MAE printf " %${FFF}s", "MAE"; print map( {sprintf " %${FFF}.2f", $_} @{$MAE[1]}), "\n"; print "\n" if $dft < $#en_dft; } exit; sub en_txt { #dd @_; my ($file,$unit) = @_; my $mult = 627.51 if $unit eq 'h'; $mult = 1 if $unit eq 'k'; my (@name,@E,$end); open L, '<', $file or die "Can't open $file: $!\n"; while (<L>) { my ($name) = m/^\s*(\S+)/; my ($en) = /(?:^|\s|=)($num)(?:\s|$)/; $en *= $mult; if ($Edisp) { my ($Edisp) = m/\sEdisp\s+($num)/; $en += $Edisp if $Edisp; } push @E, $en; push @name, $name; } close L; #dd @name; $end = delete_eq_end(\@name); @E = map {$_-$E[0]} @E; if ($min) { my $m = min(@E); @E = map {$_-$m} @E; } return \@name,\@E,$end; } sub energy { my $unit = pop @_; my $mult = 627.51 if $unit eq 'h'; $mult = 1 if $unit eq 'k'; my @E; my @mols = @_; foreach my $mol (@mols) { my $E = $mol->[0]{Energy}*$mult; if ($Edisp) { #die "No Edisp in $mol->[0]{Name}\n" unless exists $mol->[0]{Edisp}; $E += $mol->[0]{Edisp} if exists $mol->[0]{Edisp}; } push @E, $E; } @E = map {$_-$E[0]} @E; if ($min) { my $m = min(@E); @E = map {$_-$m} @E; } return @E; } sub read_molden { ############################################################################ ## Молекула -- ссылка на массив, 0-й элемент -- свойства, следующие - атомы. ## Свойства -- ссылка на хэш с ключами Energy, Symmetry ## Атом -- ссылка на массив [atom, x, y, z, ppm] (ppm может не быть). ## ## Читает xyz. Параметры - имена xyz-файлов. Если параметров нет, то <>. ## Возвращает массив найденных молекул. ############################################################################ local @ARGV = @_ ? @_ : @ARGV; my @mols; my $line; LOOP: while ($line || defined($line = <>)) { #print $line; if ($line =~/^\s*(\d+)\s*$/) { my @mol; my $N = $1; last LOOP if eof(); next LOOP if eof(ARGV); $line = <>; ($mol[0]{'Energy'}) = $line =~ /(?:^|\s|=)($num)(?:\s|$)/o; ($mol[0]{'Symmetry'}) = $line =~ /symm\S*\s+(\S+)/i; $line=~/\sG\(($num)\)\s+($num)/ && ($mol[0]{G} = $2); $line=~/\sZPE\s+($num)/ && ($mol[0]{ZPE} = $1); $line=~/\sEdisp\s+($num)/ && ($mol[0]{Edisp} = $1); for (my $i=1; $i<=$N; $i++) { last LOOP if eof(); next LOOP if eof(ARGV); $line = <>; #print $line; if ($line =~ /^\s*([A-Z]{1,2})\s+($num)\s+($num)\s+($num)\s*(.*)/io) { $mol[$i] = [$1,$2,$3,$4,$5]; } else { next LOOP; } } push @mols, \@mol; last LOOP if eof(); } else { undef $line; } } return @mols; } sub min { my $min = 1e10; foreach my $val (@_) { $min = $val if $min > $val; } return $min; } sub max { my $max = -1e10; foreach my $val (@_) { $max = $val if $max < $val; } return $max; } sub sum { my $sum; foreach my $val (@_) { $sum += $val; } return $sum; } sub mean { return undef unless @_; return sum(@_)/@_; } sub stddev { return undef unless @_; my $mean = mean(@_); my $sum = 0; foreach my $val (@_) { $sum += ($val-$mean)**2; } return sqrt($sum/$#_); } sub covariance { my @X = @{$_[0]}; my @Y = @{$_[1]}; return undef unless @X; return undef if @X != @Y; my $meanX = mean(@X); my $meanY = mean(@Y); my $sum = 0; for (my $i=0; $i<@X; $i++) { $sum += ($X[$i]-$meanX)*($Y[$i]-$meanY); } return $sum/$#X; } sub correlation { my @X = @{$_[0]}; my @Y = @{$_[1]}; return undef unless @X; return undef if @X != @Y; return covariance(@_)/(stddev(@X)*stddev(@Y)); } sub median { return undef unless @_; return $_[0] if @_==1; my @S = sort {$a<=>$b} @_; my $i1 = int($#S/2); my $i2 = $i1 + $#S%2; return ($S[$i1]+$S[$i2])/2; } sub MAE { my @X = @{$_[0]}; my @Y = @{$_[1]}; return undef unless @X; return undef if @X != @Y; my $medianXY = median(map {$X[$_]-$Y[$_]} (0..$#X)); my ($sum,$sum0); for (my $i=0; $i<@X; $i++) { $sum += abs($X[$i]-$Y[$i]-$medianXY); $sum0 += abs($X[$i]-$Y[$i]); } return $sum/@X, $sum0/@X; } sub rank { my $i = 1; my $j = 1; my @rank = map {$_->[0]} sort {$a->[1]<=>$b->[1]} map {[$j++,$_->[1]]} sort {$a->[0]<=>$b->[0]} map {[$_,$i++]} @_; return @rank; } sub spearman { my @X = @{$_[0]}; my @Y = @{$_[1]}; return undef if @X != @Y; my $n = @X; my @rankX = rank(@X); my @rankY = rank(@Y); my $rs2 = 0; for (my $i=0; $i<@X; $i++) { $rs2 += ($rankX[$i]-$rankY[$i])**2; } return 1-6*$rs2/$n/($n**2-1); } sub delete_eq_end { my @ar = @{$_[0]}; return '' if @ar == 1; my $end = ''; while (1) { my @a = @ar; $a0 = substr $a[0], -1, 1, ''; for (my $i=1; $i<@a; $i++) { my $ai = substr $a[$i], -1, 1, ''; #print "$ai ne $a0\n"; return '' if $ai eq '' && $a0 eq ''; return reverse($end) if $ai ne $a0; } $end .= $a0; @ar = @a; } } sub delete_eq_begin { my @ar = @{$_[0]}; return '' if @ar == 1; my $begin = ''; while (1) { my @a = @ar; $a0 = substr $a[0], 0, 1, ''; for (my $i=1; $i<@a; $i++) { my $ai = substr $a[$i], 0, 1, ''; return '' if $ai eq '' && $a0 eq ''; return $begin if $ai ne $a0; } $begin .= $a0; @ar = @a; } } |
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||