Новосибирский институт органической химии им. Н.Н. Ворожцова СО РАН

Лаборатория изучения механизмов органических реакций

cif2xyz


#!/usr/bin/perl -ws

our ($h,$help,$cell,$cell_corners,$move_to_cell_center,$exclude_initial,
     $all_inside_cell);
if ($h || $help) {
  (my $program = $0) =~ s/^.*[\/\\]//;
  print "Usage: $program [options] *.cif
Dependencies: perl
  Convert cif to MDL xyz format (filename.cif to filename.xyz)
The program prints filename to sdtout and warns (to stderr) if fractional
coordinates block of cif-file contains not exactly one molecule.

Options:
-cell=MxNxK  fill unit cell by symmetry operations and then
             multiply the cell M,N,K times by a,b,c directions
             (initial unit cell places in the centre of supercell)
-cell=N      equal -cell=NxNxN
-cell        equal -cell=1x1x1 (only pack unit cell)
-move_to_cell_center  move molecules included in unit cell to the cell center  
-exclude_initial      f.e. for -cell=3x3x3 make 26 unit cells (without central)
-cell_corners         add dummies XX in the unit cell corners
-all_inside_cell  place all atoms given in cif-file inside the unit cell
                  (as obabel does, not a whole molecule is obtained).
                  This option is too little tested.

   The program takes fractional coordinates 'as is' without trying to grow
molecules or eliminate disorder. So, it doesn't operate correctly with the 
symmetric molecules (not a whole molecules are obtained). A sign of disorder 
or not-a-whole-molecule is warning 'Formulas are not coincided' (however, 
occasionally this warning is caused by an error in _chemical_formula_sum).
";
  exit;
}

#use Data::Dump 'pp';
my $num = qr/\d+(?:\.\d+)?/;

FILE:
foreach my $file (@ARGV) {
  (my $name = $file) =~ s/\.cif//i;
  print "$name\n";
  my ($formula,$alpha,$beta,$gamma,$Z,$a,$b,$c,@frac);
  my @loops;
  
  open L, '<', $file or die "Can't open $file: $!\n";
  LLL:
  while (<L>) {
    s/^\s+//;
    m/^_chemical_formula_sum\s+(.+)/ and $formula = $1;
    m/^_cell_angle_alpha\s+(\S+)/ and $alpha = $1;
    m/^_cell_angle_beta\s+(\S+)/ and $beta = $1;
    m/^_cell_angle_gamma\s+(\S+)/ and $gamma = $1;
    m/^_cell_formula_units_Z\s+(\S+)/ and $Z = $1;
    m/^_cell_length_a\s+(\S+)/ and $a = $1;
    m/^_cell_length_b\s+(\S+)/ and $b = $1;
    m/^_cell_length_c\s+(\S+)/ and $c = $1;
    #print "$c\n";
    if (m/^loop_/) {
      my @loop;
      while (<L>) {
        s/^\s+//;
        if (/^_/) {
          chomp;
          push @{$loop[0]}, $_;
        }
        else {
          chomp;
          push @loop, $_;
          last;
        }
      }
      while (<L>) {
        s/^\s+//;
        if (/^_/ || /^loop_/) {
          push @loops, \@loop;
          #pp @loop;
          redo LLL;
        }
        elsif (eof || /^\s*$/) {
          chomp;
          push @loop, $_;
          push @loops, \@loop;
          #pp @loop;
          redo LLL;
        }
        else {
          chomp;
          push @loop, $_;
        }
      }
    }
  }
  close L;
  
  foreach ($a,$b,$c,$alpha,$beta,$gamma) {
    s/\(.+//;
    unless ($_) {
      warn "$file: Not cif?\n";
      next FILE;
    }
  }
  #warn "$a,$b,$c,$alpha,$beta,$gamma\n";
  
  my %formula;
  if ($formula) {
    $formula =~ s/^'(.+)'$/$1/;
    $formula =~ s/([A-Za-z])(?=\s|$)/${1}1/g;
    %formula = map {/([A-Za-z]+)($num)/ && (ucfirst lc $1,$2)} split ' ', $formula;
    #pp %formula; exit;
  }
  else {
    warn "$file: No _chemical_formula_sum\n";
  }
  
  #pp @loops;
  my @symmop;
  foreach my $loop (@loops) {
    #pp $loop;
    if (grep {/_symmetry_equiv_pos/ || /_space_group_symop/} @{$loop->[0]}) {
      my $head = shift @$loop;
      foreach (@$loop) {
        s/^\s*\d+\s+//;
        s/^'(.+)'$/$1/;
        s/\s+//g;
        tr/XYZ/xyz/;
        #s/(?<=^|,)\+//g;
        s/(^|,)\K\+//g;
      }
      #pp $loop; exit;
      @symmop = @$loop;
    }
    if (grep {/_atom_site_fract/} @{$loop->[0]}) {
      #pp $loop;
      my $head = shift @$loop;
      my $N = @$head;
      my @body = map {split} @$loop;
      my @table;
      for (my $i=0; $i<@body; $i++) {
        $table[int($i/$N)][$i%$N] = $body[$i];
      }
      if (@{$table[-1]} != $N) {
        warn "$file: Somewhat bad in loop_ where fractional coordinates. 
The number of values in the body must be a multiple of the number of tags in the header.\n";
        next FILE;
      }
      #pp $head; exit;
      
      my ($nlabel,$nsymbol,$nx,$ny,$nz);
      for (my $i=0; $i<@$head; $i++) {
        $head->[$i] eq '_atom_site_label' and $nlabel = $i;
        $head->[$i] eq '_atom_site_type_symbol' and $nsymbol = $i;
        $head->[$i] eq '_atom_site_fract_x' and $nx = $i;
        $head->[$i] eq '_atom_site_fract_y' and $ny = $i;
        $head->[$i] eq '_atom_site_fract_z' and $nz = $i;
      }
      if ( !defined($nlabel) && !defined($nsymbol) ) {
        warn "$file: Label tag in headers of loop_ where fractional coordinates?\n";
        next FILE;
      }
      
      for (my $i=0; $i<@table; $i++) {
        my $atom;
        if (defined $nsymbol) {
          $atom = $table[$i][$nsymbol];
        }
        else {
          ($atom = $table[$i][$nlabel]) =~ s/[^A-Za-z].*//;
        }
        my ($fx,$fy,$fz) = @{$table[$i]}[$nx,$ny,$nz];
        unless ($atom || $fx || $fy || $fz) {
          warn "$file: Somewhat bad in headers of loop_ where fractional coordinates\n";
          next FILE;
        }
        foreach ($fx,$fy,$fz) {
          s/\(.*//;
        }
        push @frac, [ucfirst lc $atom,$fx,$fy,$fz];
      }
    }
#    else {
#      warn "$file: No fractional coordinates\n";
#      next FILE;
#    }
  }
  #pp @frac;
  
  # Compare atoms in fractional coordinates with _cheical_formula_sum
  my %formula_frac;
  foreach my $val (@frac) {
    $formula_frac{$val->[0]}++;
  } 
  if (!%formula_frac) {
    warn "$file: No fractional coordinates\n";
    next FILE;
  }
  
  if (%formula) {
    my ($f1,$f2);
    foreach my $k (sort by_Hill keys %formula) {
      $f1 .= "$k$formula{$k} ";
    } 
    foreach my $k (sort by_Hill keys %formula_frac) {
      $f2 .= "$k$formula_frac{$k} ";
    } 
    if ($f1 ne $f2) {
      my $mult;
      foreach my $m (2..6) {
        (my $f = $f1) =~ s/($num)/$1*$m/ge;
        if ($f eq $f2) {
          $mult = $m;
          last;
        }
      }
      if ($mult) {
        warn "$file: Fractional coordinates contains $mult molecules \n";
      }
      else {
        warn "$file: Formulas are not coincided (${f1}in _chemical_formula_sum and ${f2}in frac. coord.)\n";
      }
    }
  }
  else {
    warn "$file: No _chemical_formula_sum\n";
  }
  
  if ($all_inside_cell) {
    foreach my $v (@frac) {
      foreach (@{$v}[1..3]) {
        $_ += 1 if $_<0;
        $_ -= 1 if $_>=1;
      } 
    }
  }
  
  if ($cell) {
    if ($cell =~ /^(\d+)$/) {
      $cell = "${1}x${1}x${1}";
    }
    # Pack unit cell
    my @cell_frac;
#    my @cell_frac = @frac;
    foreach my $op (@symmop) {
      #print "$op\n";
      #next if $op eq 'x,y,z';
      my @frac1 = symop($op, @frac);
      transpose_into_cell(@frac1);
      push @cell_frac, @frac1;
    }
    move_to_cell_center(@cell_frac) if ($move_to_cell_center && !$all_inside_cell);
    #print_frac(@cell_frac);
    @frac = @cell_frac;
    
    # Myltiply unit cell
    my ($a,$b,$c) = split 'x', $cell;
    my $n_cell_frac = @cell_frac;
    my @mult_frac = @frac;
    foreach my $m (mult_ar($a)) {
      foreach my $v (@frac) {
        push @mult_frac, [ $v->[0], $v->[1]+$m, $v->[2], $v->[3] ];
      }
    }
    @frac = @mult_frac;
    foreach my $m (mult_ar($b)) {
      foreach my $v (@frac) {
        push @mult_frac, [ $v->[0], $v->[1], $v->[2]+$m, $v->[3] ];
      }
    }
    @frac = @mult_frac;
    foreach my $m (mult_ar($c)) {
      foreach my $v (@frac) {
        push @mult_frac, [ $v->[0], $v->[1], $v->[2], $v->[3]+$m ];
      }
    }
    splice @mult_frac,0,$n_cell_frac if $exclude_initial;
    @frac = @mult_frac;
  }  
  
  if ($cell_corners) {
    push @frac, ['XX',0,0,0], ['XX',1,0,0], ['XX',0,1,0], ['XX',0,0,1], 
                ['XX',1,1,0], ['XX',1,0,1], ['XX',0,1,1], ['XX',1,1,1],
                ['XX',0.01,0,0], ['XX',0,0.01,0], ['XX',0,0,0.01],
                ['XX',-0.01,0,0], ['XX',0,-0.01,0], ['XX',0,0,-0.01];
  }
  
  my $xyz = frac2xyz(\@frac,$a,$b,$c,$alpha,$beta,$gamma);
  write_molden($xyz,"$name.xyz");
}

sub frac2xyz {
  my ($frac,$a,$b,$c,$alpha,$beta,$gamma)=@_;
  my ($sa,$sb,$sg) = map {sin($_/57.29578)} ($alpha,$beta,$gamma);
  my ($ca,$cb,$cg) = map {cos($_/57.29578)} ($alpha,$beta,$gamma);
  #print "$sa,$sb,$sg, $ca,$cb,$cg\n";
  my @xyz = ({});
  foreach (@$frac) {
    my ($at,$x,$y,$z) = @$_;
    my $X = $a*$x + $b*$cg*$y + $c*$cb*$z;
    my $Y = $b*$sg*$y + $c*($ca-$cb*$cg)/$sg*$z;
    my $Z = $c*sqrt(1-$ca**2-$cb**2-$cg**2+2*$ca*$cb*$cg)/$sg*$z;
    push @xyz, [$at,$X,$Y,$Z];
  }
  return \@xyz;  
}

sub write_molden {
  my $oldfh;
  
  if (@_ > 1 && !ref($_[-1])) {
    my $file = pop @_;
    open F, '>', $file or do {warn "Can't write to $file: $!\n"; return};
    $oldfh = select F;
  }
  
  foreach my $mol (@_) {
    my $N = $#{$mol};
    print " $N\n";
	  print " Energy $mol->[0]{Energy}" if $mol->[0]{Energy};
	  print "  Symmetry $mol->[0]{Symmetry}" if $mol->[0]{Symmetry};
	  print "\n";
	  for (my $i=1; $i<=$N; $i++) {
      printf " %-2s %12.8f %12.8f %12.8f\n", @{$mol->[$i]};
	  }
  }
  
  if ($oldfh) {
    close F;
    select $oldfh;
  }
}

sub move_to_cell_center {
  my ($Cx,$Cy,$Cz);
  foreach my $v (@_) {
    $Cx += $v->[1];
    $Cy += $v->[2];
    $Cz += $v->[3];
  }
  ($Cx,$Cy,$Cz) = map {$_/@_} ($Cx,$Cy,$Cz);
  foreach my $v (@_) {
    #printf "%2s%8.3f%8.3f%8.3f\n", @$v;
    $v->[1] -= ($Cx-0.5);
    $v->[2] -= ($Cy-0.5);
    $v->[3] -= ($Cz-0.5);
  }
  return 1;
}

sub print_frac {
  foreach my $v (@_) {
    printf "%2s%8.3f%8.3f%8.3f\n", @$v;
  }
  print "\n";
  return 1;
}

sub symop {
  my $op = shift;
  my @frac = @_;
  $op =~ s/([xyz])/\$$1/g;
  my @ops = split ',', $op;
  #print "@ops\n";
  my @frac1;
  foreach my $val (@frac) {
    my ($at,$x,$y,$z) = @$val;
    my ($x1,$y1,$z1);
    $x1 = eval "$ops[0]"; warn $@ if $@;
    $y1 = eval "$ops[1]"; warn $@ if $@;
    $z1 = eval "$ops[2]"; warn $@ if $@;
    #printf "%2s%8.3f%8.3f%8.3f\n", $at,$x1,$y1,$z1;
    push @frac1, [$at,$x1,$y1,$z1];
  }
  return @frac1;
}

sub center {
  my ($x,$y,$z);
  foreach my $v (@_) {
    $x += $v->[1];
    $y += $v->[2];
    $z += $v->[3];
  }
  return map {$_/@_} ($x,$y,$z);
}

sub transpose_into_cell {
  my @frac = @_;
  my ($cx,$cy,$cz) = center(@frac);
  #printf "%8.3f%8.3f%8.3f     ", $cx,$cy,$cz;
  my ($sx,$sy,$sz) = (0,0,0);
  $cx<0 and $sx = 1;
  $cy<0 and $sy = 1;
  $cz<0 and $sz = 1;
  $cx>1 and $sx = -1;
  $cy>1 and $sy = -1;
  $cz>1 and $sz = -1;
  #print "$sx,$sy,$sz\n";
  if ($sx != 0 || $sy != 0 || $sz != 0) {
    #print_frac(@frac);
    foreach (@frac) {
      @$_ = ($_->[0], $_->[1]+$sx,$_->[2]+$sy,$_->[3]+$sz);
    }
    #print_frac(@frac);
  }
  #printf "%8.3f%8.3f%8.3f\n", center(@frac);
  
}

sub mult_ar {
  my $n = shift;
  my @nnn = ();
  foreach (1..$n-1) {
    push @nnn, $_%2 ? $_/2+0.5 : -$_/2;
  }
  return @nnn;
}

sub by_Hill {
	return -1 if uc($a) eq 'C' and uc($b) ne 'C';
	return  1 if uc($a) ne 'C' and uc($b) eq 'C';
	return -1 if uc($a) eq 'H' and uc($b) ne 'H';
	return  1 if uc($a) ne 'H' and uc($b) eq 'H';
	uc($a) cmp uc($b);
}