#!/usr/bin/perl
#
# Author: Josh Odom
#
# Version: 0.01 (alpha)
#
# Description:
# This program is designed to provide query-like functionality to delimited
# text-files.  This was created in response to the author's perceived
# lack of ability to find and replace named columns in text files, even though
# other column-based utilities exist (like cut, paste), and other search
# and replace utilities exist (like grep and sed).
#
# Please note that this utility is currently in an alpha state, and care should
# be taken when using it with sensitive data.  
#
#
# Revisions:
# 0.01 - Febuary 6, 2009 - Initial release
#
# Known Bugs and Missing Functionality:
# - no checks are made to determine that the requested field exists
# - the functionality is undefined for using fields which do not exist
#   in the file
# - zero and negative fields do not have defined functionality
# - columns cannot be selected from a description on a header row (i.e.
#   specifying a column name instead of field number)
# - usage is not displayed on error
# - code is not thoroughly documented
# - "and"s strictly have higher precedence than "or"s, and there is no way
#   to force precedence
# - this utility reads from stdin and writes to stdout, and there's no way
#   to specify an input or output file
#
# Other Notes:
# - code is not thoroughly documented
# - requires perl 5


use strict;
use Switch;


## Global variables
my $delimiter;
my $type;
my $root_condition;
my @assignment_criteria = ();

my $line;
my @tokens;
my $temp;


## Fake enumeration
my $CONDITIONGROUP = 1;
my $ORGROUP = 2;
my $ANDGROUP = 3;
my $CONDITION = 4;
my $FIELD = 7;
my $CONSTANT = 8;

my $SELECT = 11;
my $UPDATE = 12;
my $DELETE = 13;
my $SET = 14;

my $EQUAL = 21;
my $NOTEQUAL = 22;
my $LESSTHAN = 23;
my $GREATERTHAN = 24;
my $LESSEQUAL = 25;
my $GREATEREQUAL = 26;


my $VERSION = "0.01";
my $USAGE = "usage: filesql.pl -d{delim} select where {CONDITIONGROUP}\n"
          . "usage: filesql.pl -d{delim} update set -f{number} = {TERMINAL} \\\n"
          . "       [-f{number} = {TERMINAL} ...] [where {CONDITIONGROUP}]\n"
          . "usage: filesql.pl -d{delim} delete where {CONDITIONGROUP}\n\n"
          . "{CONDITIONGROUP} := {ANDGROUP} [or {ANDGROUP} ...]\n"
          . "{ANDGROUP}       := {CONDITION} [and {CONDITION} ...]\n"
          . "{CONDITION}      := {TERMINAL} {OPERATOR} {TERMINAL}\n"
          . "{TERMINAL}       := -f{number} | {constant}\n"
          . "{OPERATOR}       := -eq | -ne | -gt | -lt | -ge | -le\n";



## read arguments off command line
$temp = shift(@ARGV);


if ($temp eq "-v" || $temp eq "--version") {
  print $VERSION . "\n";
  exit 0;
}

if ($temp eq "-h" || $temp eq "--help") {
  print $USAGE;
  exit 0;
}


## Expects the first argument to be the delimiter, or dies
substr($temp, 0, 2) eq "-d" || die "Bad delimiter\n";
length $temp == 3 || die "Bad delimiter\n";

$delimiter = substr($temp, 2, 1);


## Gets the type of selection
$temp = shift(@ARGV);

switch ($temp) {
  case "select" {
    $type = $SELECT;
  }
  case "update" {
    $type = $UPDATE;
  }
  case "delete" {
    $type = $DELETE;
  }
  else {
    die "Bad selection type\n";
  }
}


## Read the "set" statement, if "update"
if ($type == $UPDATE) {
  $temp = shift(@ARGV);
  $temp eq "set" || die "Missing 'set' statement\n";
  
  while (defined @ARGV[0] && @ARGV[0] ne "where") {
    substr(@ARGV[0], 0, 2) eq "-f" || die "Bad field\n";
    length @ARGV[0] >= 3 || die "Bad field\n";
    @ARGV[1] eq "=" || die "Missing '=' in set statement]n";
    defined @ARGV[0] || die "Missing assignment value\n";
    
    my $temp_criterion = [substr(@ARGV[0], 2)-1, &eval_terminal(@ARGV[2])];
    
    push(@assignment_criteria, $temp_criterion);
    
    shift(@ARGV);
    shift(@ARGV);
    shift(@ARGV);
  }
}


## Read condition
$temp = shift(@ARGV);


if ($temp eq "where") {
  $root_condition = [$CONDITIONGROUP, &eval_or];
} else {
  $root_condition = [$CONDITIONGROUP];
}



## Process each line from stdin based on criteria read from command line
while ( <STDIN> )
{
  $line = $_;
  
  ## It is important that we strip the newline off before the split and add
  ## it back after the join, in the case that the final field gets updated or
  ## new fields get added
  
  chomp($line);
  @tokens = split /[$delimiter]/, $line;
  
  ## Evaluate this line against given conditions
  my $evaluation = &evaluate($root_condition);
  
  switch ($type) {
    case ($SELECT) {
      if ($evaluation) {
        print join($delimiter,@tokens) . "\n";
      }
    }
    case ($DELETE) {
      if ( ! $evaluation) {
        print join($delimiter,@tokens) . "\n";
      }
    }
    case ($UPDATE) {
      if ($evaluation) {
        my @out_tokens = @tokens;
        foreach (@assignment_criteria) {
          my @criterion = @{$_};
          $out_tokens[@criterion[0]] = &evaluate(@criterion[1]);
        }
        print join($delimiter,@out_tokens) . "\n";
      } else {
        print join($delimiter,@tokens) . "\n";
      }
    }
  }
}


## Evaluates an array where the first element is its optype, and the rest
## of the elements are its data.
## The criteria specified on the command line is stored as a tree of nested
## lists in this format.
sub evaluate {
  my @args = @{@_[0]};
  switch (@args[0]) {
    case ($CONDITIONGROUP) {
      if (defined @args[1]) {
        return &evaluate(@args[1]);
      } else {
        return 1;
      }
    }
    case ($ORGROUP) {
      if ( &evaluate(@args[1]) ) {
        return 1;
      } elsif ( defined @args[2] ) {
        return &evaluate(@args[2]);
      } else {
        return 0;
      }
    }
    case ($ANDGROUP) {
      if ( ! &evaluate(@args[1]) ) {
        return 0;
      } elsif ( defined @args[2] ) {
        return &evaluate(@args[2]);
      } else {
        return 1;
      }
    }
    case ($CONDITION) {
      switch (@args[1]) {
        case ($EQUAL) {
          return &evaluate(@args[2]) eq &evaluate(@args[3]);
        }
        case ($NOTEQUAL) {
          return &evaluate(@args[2]) ne &evaluate(@args[3]);
        }
        case ($LESSTHAN) {
          return &evaluate(@args[2]) < &evaluate(@args[3]);
        }
        case ($GREATERTHAN) {
          return &evaluate(@args[2]) > &evaluate(@args[3]);
        }
        case ($LESSEQUAL) {
          return &evaluate(@args[2]) <= &evaluate(@args[3]);
        }
        case ($GREATEREQUAL) {
          return &evaluate(@args[2]) >= &evaluate(@args[3]);
        }
      }
    }
    case ($FIELD) {
      return @tokens[@args[1]];
    }
    case ($CONSTANT) {
      return @args[1];
    }
  }
}


## Processes a "terminal" on the command line, which is either a field
## or a constant
sub eval_terminal {
  my $temp = @_[0];
  if (substr($temp, 0, 2) eq "-f") {
    my $temp_field = (substr($temp, 2) - 1);
    return [$FIELD, $temp_field];
  } elsif (substr($temp, 0, 2) eq "--") {
    return [$CONSTANT, substr($temp, 1)];
  } else {
    return [$CONSTANT, $temp];
  }
}


## Processes a group of statements joined by "or"
sub eval_or {
  my $andgroup = &eval_and;
  if (! defined @ARGV[0]) {
    return [$ORGROUP, $andgroup];
  } else {
    $temp = shift(@ARGV);
    $temp eq "or" || die "Expected 'or'\n";
    
    my $orgroup = &eval_or;
    return [$ORGROUP, $andgroup, $orgroup];
  }
}


## Processes a group of statements joines by "and"
sub eval_and {
  my $condition = &eval_condition;
  if (! defined @ARGV[0] || @ARGV[0] ne "and") {
    return [$ANDGROUP, $condition];
  } else {
    $temp = shift(@ARGV);
    $temp eq "and" || die "Expected 'and'\n"; ## redundant
    
    my $andgroup = &eval_and;
    return [$ANDGROUP, $condition, $andgroup];
  }
}


## Processes an atomic condition
sub eval_condition {
  defined @ARGV[0] || die "Expected criteria\n";
  defined @ARGV[1] || die "Expected criteria\n";
  defined @ARGV[2] || die "Expected criteria\n";
  
  my $terminal1 = &eval_terminal(@ARGV[0]);
  my $op;
  my $terminal2 = &eval_terminal(@ARGV[2]);
  
  switch (@ARGV[1]) {
    case "-eq" {
      $op = $EQUAL;
    }
    case "-ne" {
      $op = $NOTEQUAL;
    }
    case "-lt" {
      $op = $LESSTHAN;
    }
    case "-gt" {
      $op = $GREATERTHAN;
    }
    case "-le" {
      $op = $LESSEQUAL;
    }
    case "-ge" {
      $op = $GREATEREQUAL;
    }
    else {
      die "Invalid operator\n";
    }
  }
  
  shift(@ARGV);
  shift(@ARGV);
  shift(@ARGV);
  
  return [$CONDITION, $op, $terminal1, $terminal2];
}

