=head1 NAME

iPE::Sequence - base class for biological sequences, including consensus sequences.

=head1 DESCRIPTION

Construction of these objects require an L<iPE::SequenceReader> object.  This provides major functionality for single sequences which can be read with a standard fasta file.  All objects which claim to be a sequence for estimation in iPE must have the functions below defined.

=head1 VARIABLES

=over 8

=cut

package iPE::Sequence;
use iPE;
use iPE::Util::DNATools;
use strict;


=item @alphabet

The letters of the sequence which represent a single letter.  getAlphabet () returns the array.

=cut
our @alphabet = ();

=item %ambigCodes

The ambiguity codes of the alphabet, represented as a hash of array references.  This should include the wildcard character.

=cut
our %ambigCodes = ();

=item $wildCard

The character which represents any letter in the alphabet.

=cut
our $wildCard = undef;

=back

=head1 CLASS METHODS

=over 8

=item getAlphabet (), getAmbigCodes (), getWildCard () 

Gets the above global variables as defined by the calling class.  getAlphabet and getAmbigCodes return references, whereas getWildCard returns a string containing the wildcard character.

=cut
{
    no strict 'refs';
    sub getAlphabet { \@{(ref($_[0]) || $_[0])."::alphabet"} }
    sub getAmbigCodes { \%{(ref($_[0]) || $_[0])."::ambigCodes"} }
    sub getWildCard { ${(ref($_[0]) || $_[0])."::wildCard"} }
}

# stores expanded ambiguity codes
our %expandCache = ();

=item expandAmbigSeq (sequence, exclusionList)

Expands an ambiguity sequence to all its possible sequences.  Pass in a reference to an array of ambiguity codes that you do not want resolved to ignore those ambiguity codes.  Returns an array of sequences which represent the expansion of the sequence.

=cut
sub expandAmbigSeq {
    use bytes;
    my $invocant = shift;
    my $class = ref($invocant) || $invocant;
    my ($seq, $exclList, $useWildCard, $start) = @_;

    $start = 0 if(!defined($start));
    $useWildCard = 0 if(!defined($useWildCard));

    die "Cannot call expandAmbigSeq on ".__PACKAGE__."\n"
        if $class eq __PACKAGE__;

    my $exclString = "";
    $exclString = join "", @$exclList if(defined($exclList));
    my $cacheString = $class."_".$seq."_".$exclString."_".$useWildCard;
    if(defined $expandCache{$cacheString}) {
        return @{$expandCache{$cacheString}};
    }

    my @results = ();
    my %ac = %{$class->getAmbigCodes()};

    if(defined $exclList) {
        for my $unusedCode (@$exclList) { $ac{$unusedCode} = undef; }
    }

    my $prefix = substr($seq, 0, $start);
    
    #scan for ambiguity codes
    for my $i ($start .. length($seq)-1) {
        my $c = substr($seq, $i, 1);
        #if one is found, recursively resolve each of the remaining codes
        my $letterRef = $ac{$c};
        if (defined($letterRef)) {
            my @letterList = @$letterRef;
            if($useWildCard && $c eq $class->getWildCard) {
                push @letterList, $class->getWildCard;
            }
            my $suffix = substr($seq, $i+1);
            for my $l (@letterList) {
                push @results, 
                    $class->expandAmbigSeq($prefix.$l.$suffix, $exclList, 
                        $useWildCard, $i+1);
            }
            last;
        }
        $prefix .= $c;
    }

    #if none is found, just return the original sequence
    if(scalar(@results)) { 
        $expandCache{$cacheString} = [ @results ];
        return @results; 
    }
    return $seq;
}

=item ambigSeqToRegExp (seq[, strictWildCard])

Converts any ambiguity sequence (including wildcard characters) into a regular expression and returns it.  If you pass 1 for strictWildCard, the wildCard character will only match sequences in the original alphabet.  Otherwise the wildcard character will match any character.

=cut
sub ambigSeqToRegExp {
    my $invocant = shift;
    my $class = ref($invocant) || $invocant;
    my ($seq, $strictWildCard) = @_;
    $strictWildCard = 0 if(!defined($strictWildCard));

    my $re = $seq;
    for my $code (keys %{$class->getAmbigCodes()}) {
        # build a character class for each ambiguity code.
        my $cclass;
        if($code eq $class->getWildCard() && !$strictWildCard) {
            $cclass = ".";
        }
        else {
            $cclass = "[";
            $cclass .= $_ for (@{$class->getAmbigCodes()->{$code}});
            $cclass .= "]";
        }
        $re =~ s/$code/$cclass/g;
    }

    return $re;
}

=item ambigSeqRegExp

Gives a regular expression that will detect if any ambiguity code exists in a string.

=cut
sub ambigSeqRegExp {
    my $invocant = shift;
    my $class = ref($invocant) || $invocant;
    my $re = "[".(join("", keys(%{$class->getAmbigCodes()})))."]";
    return $re;
}


=item generateWildCardSeq (length)

Generate a sequence of characters with only the wildcards included.

=cut
sub generateWildCardSeq {
    my $invocant = shift;
    my $class = ref($invocant) || $invocant;
    my ($length) = @_;
    die "Cannot call generateWildCardSequence on class ".__PACKAGE__."\n"
        if ($class eq __PACKAGE__);

    my $ambigStr = "";
    for (0 .. $length-1) { $ambigStr .= $class->getWildCard() }
    return $ambigStr;
}

=item getAllSequences (length[, ambiguate])

Returns all the sequences for a given length by expanding the wildcard sequence of the given length.  If the 'ambiguate' flag is set to 1, then the results will include the wildcard caracter of the alphabet.

=cut
sub getAllSequences {
    my $invocant = shift;
    my $class = ref($invocant) || $invocant;
    my ($length, $ambiguate) = @_;
    die "Cannot call generateWildCardSequence on class ".__PACKAGE__."\n"
        if ($class eq __PACKAGE__);
    $ambiguate = 0 if (!defined($ambiguate));

    return $class->expandAmbigSeq($class->generateWildCardSeq($length), [],
        $ambiguate);
}

=back

=head1 METHODS

=over 8

=item new ({sr => sequenceReader})

The constructor of a sequence takes a hash reference that is required to contain a SequenceReader object.  The SequenceReader object will not be advanced to the next sequence in the SequenceReader file after construction.

=cut

sub new {
    my ($class, $m) = @_;
    my $this = bless {}, $class;

    die "$class must be constructed with a SequenceReader object.\n"
        if (not defined $m->{sr});

    $this->{sr_}      = $m->{sr};
    $this->{numSeqs_} = $this->{sr_}->numSeqs;

    if($this->{sr_}->type eq "load") {
        $this->{seqRef_} = $this->{sr_}->seqRef;
        $this->{length_} = length(${$this->{seqRef_}});
        $this->{def_}    = $this->{sr_}->def;
        $this->{loaded_} = 1;
    }
    else {
        $this->{length_} = $this->{sr_}->length;
        $this->{loaded_} = 0;
    }

    return $this;
}

=item writeFeatureToFH (fh, feature), writeRegionToFH (fh, region)

This writes out the sequence for a feature to the filehandle.  By default, this assumes a simple sequence, but if a sequence needs extra coercing, you must override this method.

=cut

# note this does not revcomp coordinates.  this is to accomodate for the 
# difference between a region and a feature.
sub writeSubseqToFH {
    my ($this, $fh, $start, $end, $strand) = @_;
    my $length = $end-$start+1;

    if($this->loaded) {
        if($strand eq '-') {
            print $fh (substr(${$this->rcRef}, $start, $length)."\n");
        }
        else {
            print $fh (substr(${$this->seqRef}, $start, $length)."\n");
        }
    }
    else {
        for(my $i = 0; $i < $this->{numSeqs_}; $i++) {
            if($strand eq '-') {
                print $fh ($this->{sr_}->getRevSeq($start, $length, $i)."\n");
            }
            else {
                print $fh ($this->{sr_}->getSeq($start, $length, $i)."\n");
            }
        }
    }
}

sub writeFeatureToFH {
    my ($this, $fh, $feature) = @_;

    my ($start, $end) = ($feature->start, $feature->end);
    if($feature->strand eq '-') {
          ($start, $end) = 
              rcCoords($feature->start, $feature->end, $this->length);
    }
    $this->writeSubseqToFH($fh, $start, $end, $feature->strand);
}

sub writeRegionToFH { 
    my ($this, $fh, $region) = @_;
    $this->writeSubseqToFH($fh, $region->start, $region->end, $region->strand);
}

=item length ()

This returns the length of the sequence.  This is passed on from the sequence reader.

=cut
sub length { shift->{length_}   }

=item def ()

Return a reference to the definition string of this sequence.

=cut
sub def { shift->{def_} }

=item loaded ()

Indicates whether the sequence is loaded into memory.  If 1, the seqRef and rcRef strings should be valid, if not, you may access the sequences via the getContext function.

=cut
sub loaded { shift->{loaded_} }

=item seqRef ()

Return a *reference* to the sequence string.  The sequence string is a reference to prevent unwanted copying of long sequences.  In the case where this is a complex datatype and not just a string, this should be overridden to return an object reference.

=cut
sub seqRef { shift->{seqRef_} }

=item rcRef ()

Return a reference to the reverse complement (or just reverse) of the sequence.  Note that this is not implemented in the base class, and must be overridden in all subclasses.

=cut
sub rcRef  { 
    my $this = shift;
    die "Reverse Complement (rcRef) subroutine not defined in ".ref($this)."\n";
}

=item seqReader ()

In the case that the sequence has not been loaded, you may use the sequence reader to retrieve parts of the sequence needed.

=cut
sub seqReader { shift->{sr_} }

=item type ()

All sequences should have a brief tag for the english description of the type of sequence for matching up with the parameter template file.  For example, plain DNA sequence is "DNA".  This is not defined in the base class, and all Sequence subclasses must define this.

=cut
sub type { 
    my $this = shift;
    die "Sequence class ".ref($this)."does not have the type method defined\n";
}

=item getContext (pos, strand, context[, targetContext])

Gets the context encoded into a single space-delimited (by position) string.  Optionally pass a target context if you want a different context size for the target.  The missing letters will be replaced with "X"s.  Here is an example which illustrates the encoding:

Alignment:
T: AT
I1:CG
I2:GC

Encoding of getContext(1,"+",1):
ACG TGC

Encoding of getContext(1,"+",0,1):
AXX TGC

=cut
sub getContext {
    my ($this, $strand, $base, $order, $targetOrder) = @_;
    
    $targetOrder = $order if !defined($targetOrder);
    my $targetStart = $base-$targetOrder;
    my $start = $base-$order;
    my $length = $targetOrder > $order ? $targetOrder+1 : $order+1;

    #handle the target context first
    my @subseqs;
    my $prefix = "";
    my $diff = $targetStart-$start;
    my $seq;
    for (; $diff > 0; $diff--) { $prefix .= "X" }
    if($strand eq '+') {
        $seq = $this->{sr_}->getSeq($targetStart, $targetOrder+1, 0);
    }
    else {
        $seq = $this->{sr_}->getRevSeq($targetStart, $targetOrder+1, 0)
    }

    push @subseqs, $prefix.$seq;

    $prefix = "";
    $diff = $start-$targetStart;
    for(; $diff > 0; $diff--) { $prefix .= "X" }
    for(my $i = 1; $i < $this->{numSeqs_}; $i++) {
        if($strand eq '+') {
            $seq = $this->{sr_}->getSeq($start, $order+1, $i);
        }
        else {
            $seq = $this->{sr_}->getRevSeq($start, $order+1, $i);
        }
        push @subseqs, $prefix.$seq;
    }
    
    my $context = "";
    for(my $i = 0; $i < $length; $i++) {
        for(my $j = 0; $j < $this->{numSeqs_}; $j++) {
            $context .= substr($subseqs[$j], $i, 1);
        }
        $context .= " " if($i < $length-1 && $this->{numSeqs_} > 1);
    }

    return $context;
}


#sub _getContextLoaded {
    #my ($this, $strand, $strref, $start, $offset) = @_;
    #return substr($$strref, $start, $offset);
    ##if($strand eq "+") {
        #return substr($this->{strRef_}, $start, $offset);
    #}
    #else {
        #return substr($this->{rcRef_}, $start, $offset);
    #}
#}

#sub getContext { shift->{getContext_} }
    #my ($this, $strand, $start, $offset) = @_;

    #if($this->{loaded_}) {
        #if($strand eq "+") {
            #return substr($this->{strRef_}, $start, $offset);
        #}
        #else {
            #return substr($this->{rcRef_}, $start, $offset);
        #}
    #}
    #else {
    #}
#}

=back

=head1 SEE ALSO

L<iPE::SequenceReader>

=head1 AUTHOR

Bob Zimmermann (rpz@cse.wustl.edu).

=cut

1;

__DATA__
/* code is unused and never verified */

void _expandAmbigSeqHelper(char *seq, HV *ac_hvp, int start, int len, int *n_results, int *alloced_results, char ***results);

void _expandAmbigSeq(AV *results_avp, char *seq, HV *ac_hvp, AV *excl_avp) 
{
    int i, n_results, alloced_results, let_len;
    int len;
    char ** results;
    SV **letter_svpp, *letter_svp;
    char * key;

    len = strlen(seq);
    n_results = 1;
    let_len = 1;

    results = malloc(sizeof(char **) * 1);
    *results = malloc(sizeof(char *) * 5);
    alloced_results = 5;

    /* remove all the keys from the exclusion list */


    for(i = 0; i < av_len(excl_avp); i++) {
        letter_svpp = av_fetch(excl_avp, i, 0);
        letter_svp = *letter_svpp;
        key = SvPV(letter_svp, let_len);
        if(hv_exists(ac_hvp, key, let_len)) {
            hv_delete(ac_hvp, key, let_len, 0);
        }
    }

    /* do the actual expansion */
    _expandAmbigSeqHelper(seq, ac_hvp, 0, len, &n_results, &alloced_results, &results);
    results_avp = newAV();

    /* construct the array to expand */
    for(i = 0; i < n_results; i++) {
        av_push(results_avp, newSVpv(results[i], len));
    }
}

void _expandAmbigSeqHelper(char *seq, HV *ac_hvp, int start, int len, int *n_results, int *alloced_results, char ***results)
{
    int i, j, k, let_len;
    SV ** codearr_svpp;
    AV *  codearr_avp;
    SV *  code_sv;
    char *code;
    char *subseq;

    subseq = malloc(sizeof(char) * len);
    let_len = 1;

    for(i = start; i < len; i++) {
        if(hv_exists(ac_hvp, &seq[i], let_len)) {
            _expandAmbigSeqHelper(&seq[i], ac_hvp, len-i-1, len,
                n_results, alloced_results, results);
            codearr_svpp = hv_fetch(ac_hvp, &seq[i], 1, 0);
            codearr_avp  = SvRV(*codearr_svpp);
            for(j = 0; j < av_len(codearr_avp); j++) {
                code_sv = av_fetch(codearr_avp, j, 0);
                code = *((char *)SvPV(code_sv, let_len));
                warn(code);
                for(k = 0; k < *n_results; k++) {
                    strncpy(subseq, seq, i);
                    strncpy(subseq+i+1, code, 1);
                    /* 
                     * TODO: optimization: rather than recursing, 
                     * just tack on to end
                     */
                    strcpy(subseq+i+2, seq+i+2);
                    _expandAmbigSeqHelper(subseq, ac_hvp, i+1, len, 
                        n_results, alloced_results, results);
                    return;
                }
            }
        }
    }

    if(*n_results == *alloced_results) {
        *alloced_results *= 2;
        *results = realloc(*results, sizeof(char *) * *alloced_results);
        if(*results == NULL) die("Out of memory when expanding ambig seq.\n");
        (*results)[*n_results] = malloc(sizeof(char *)*len);
        strcpy((*results)[*n_results], seq);
        (*n_results)++;
    }
}


