#!/usr/bin/perl -ws

use strict;

# Command-line switches. They are package globals because the -s switch on the
# shebang line stores "-name" / "-name=value" arguments in variables of the
# same name in package main.
our ($h,$help,$n,$debug,$re);

# Show the usage text and stop when -h or -help was given.
if ($h || $help) {
  # Strip any leading directory (Unix or Windows separators) from the script path.
  (my $program = $0) =~ s/^.*[\/\\]//;
  print <<"END_USAGE";
Usage: $program [-debug] [-n=N] [-re=REGEXPS] [file.out ...]

Filter an output file (or standard input) and fold text blocks containing
similar lines. This program is useful for analysis of huge quantum chemical
outputs.

A regexp is generated for each line and is compared with the regexp of the
previous line; a run of lines with identical regexps is folded.

Options:
-debug  Print the regexp generated for every line to stderr.
-n=10   Don't fold text blocks shorter than 10 lines (default: 10).
-re=regexp  Add user's regexps. A regexp must not contain literal spaces
       (use \\s instead). Several regexps must be delimited with spaces.
       E.g. -re='[\\d\\s]+([\\s-]?\\d+\\.\\d+E[+-]\\d+)+' for GAMESS formatted groups.
END_USAGE
  exit;
}

# Minimum number of consecutive similar lines that gets folded (default 10).
$n ||= 10;
# A negative or non-numeric threshold would make even an empty buffer look
# "foldable", so reject it up front instead of printing bogus markers.
die "Invalid -n value '$n': expected a positive integer\n"
  unless $n =~ /\A\d+\z/ && $n > 0;
$re ||= '';

# User-supplied token patterns, separated by whitespace; they are handed to
# do_regexp() together with every line.
my @re = split ' ', $re;

my $prev_regexp = '';   # signature of the lines currently buffered in @arr
my @arr;                # consecutive lines sharing $prev_regexp

while (my $line = <>) {
  my $regexp = do_regexp($line, @re);
  warn "line: $., prev: $prev_regexp, cur: $regexp\n" if $debug;

  # A different signature ends the current run: emit it and start a new one.
  if ($regexp ne $prev_regexp) {
    print_block($n, '############## and %d similar lines ##############', @arr);
    @arr = ();
    $prev_regexp = $regexp;
  }
  push @arr, $line;
}
# Emit whatever is still buffered once the input is exhausted.
print_block($n, '... and yet %d similar lines ...', @arr);

# Print one run of similar lines, folded or verbatim.
#   $min_lines     - runs with at least this many lines are folded
#   $marker_format - printf format for the fold marker; its %d receives the
#                    number of lines hidden after the first one
#   @lines         - the lines of the run (each still carrying its newline)
# A folded run is printed as its first line followed by the marker line.
sub print_block {
  my ($min_lines, $marker_format, @lines) = @_;
  return unless @lines;

  if (@lines >= $min_lines) {
    print $lines[0];
    printf "$marker_format\n", $#lines;
  } else {
    print @lines;
  }
  return;
}

# Build a structural signature for one line of text.
#
# The line is consumed left to right. At every position the first pattern
# that matches there (user patterns first, then the built-in ones in the
# order listed below) consumes its match, and the pattern's source text is
# appended to the signature. Two lines with the same signature therefore
# consist of the same sequence of token kinds.
#
# Arguments: the sample line, followed by optional user pattern strings.
# Returns:   the concatenated pattern sources ('' for an empty/undef sample).
sub do_regexp {
  my ($sample, @user_patterns) = @_;
  $sample = '' unless defined $sample;

  my @patterns = (
    @user_patterns,
    '-?\d+(\.\d+)?[eE][-+]?\d+', # floating-point number in scientific notation
    '-?\d+\.\d+',   # floating-point number in fixed decimal notation
    '-?\d+',        # signed integer
    '\w+',          # any word (alphanumeric and _ characters)
    '[[:punct:]]+', # ASCII non-controls, non-alphanumeric, non-space characters
    '\s+',          # space characters
    '\S+',          # any non-space characters
  );

  my $signature = '';
  my $length    = length $sample;
  my $offset    = 0;

  TOKEN:
  while ($offset < $length) {
    for my $pattern (@patterns) {
      pos($sample) = $offset;
      # (?:...) keeps \G attached to the whole pattern even when a user
      # pattern contains a top-level alternation.
      next unless $sample =~ /\G(?:$pattern)/gc;
      # Ignore zero-width matches: they consume nothing and would otherwise
      # make the scan loop forever.
      next if pos($sample) == $offset;

      $signature .= $pattern;
      $offset = pos($sample);
      next TOKEN;
    }
    # Safety net only: \s+ and \S+ together match any character.
    last TOKEN;
  }

  return $signature;
}