// -*- C++ -*------------------------------------------------------------------
//  $Id: 
//
//  Class:              SequenceAlignment
//
//  Base Class(es):     -
//
//  Derived Class(es):  SimpleSequenceAlignment
//
//  Authors:            Eckart Bindewald
//
//  Description:
//    This class is an interface for a simple alignment class.
//
// ---------------------------------------------------------------------------

#ifndef _SEQUENCE_ALIGNMENT_H_
#define _SEQUENCE_ALIGNMENT_H_

// Includes:
#include <algorithm>
#include <iostream>
#include <map>
#include <string>

#include <Vec.h>

// NOTE(review): a using-directive in a header leaks into every includer; it is
// kept because the declarations below (and possibly client code) use
// unqualified standard-library names.
using namespace std;

/** Primary gap symbol. */
const char GAP_CHAR = '-';

/** Alternative gap symbol. */
const char GAP_ALT = '.';

/** Character that starts a FASTA header line. */
const char FASTA_CHAR = '>';

/** All accepted gap symbols (GAP_CHAR followed by GAP_ALT) in one string. */
const string GAP_CHARS("-.");

/**
 * Abstract interface of a sequence alignment.
 *
 * All instance methods are pure virtual and must be supplied by a concrete
 * class; only the static helpers at the end of the class carry an
 * implementation.
 */
class SequenceAlignment {

 public:

  /** TYPEDEFS */

  typedef unsigned int size_type;                  // index / count of sequences
  typedef string sequence_type;                    // one aligned sequence
  typedef string::size_type sequence_size_type;    // column index / count of characters
  typedef Vec<string> sequence_container;
  typedef Vec<string> name_container;
  typedef map<string, string> properties_type;     // additional properties of each sequence

  virtual ~SequenceAlignment() { }

  /** Adds sequence s with the given name. */
  virtual void addSequence(const sequence_type& s,
                           const string& name) = 0;

  /** Adds sequence s with the given name and per-sequence properties. */
  virtual void addSequence(const sequence_type& s,
                           const string& name,
                           const properties_type& seqProperties) = 0;

  /** Removes all sequence data. */
  virtual void clear() = 0;

  /** Removes sequences with rank greater than n. */
  virtual void clipAfter(size_type n) = 0;

  /** Removes all columns that correspond to a gap in the sequence with the specified number. */
  virtual void collapse(size_type seqId) = 0;

  /** Compresses the internal state of the sequence alignment.
   *  All methods should still work, albeit slower. */
  virtual void compress() = 0;

  /** Counts the number of characters and gaps. Careful: characters "X" and "N" are counted as characters. */
  virtual void countCharacters(sequence_size_type& numChars, sequence_size_type& numGaps) const = 0;

  /** Counts the number of characters and gaps of the n'th sequence. Careful: characters "X" and "N" are counted as characters. */
  virtual void countCharacters(sequence_size_type& numChars, sequence_size_type& numGaps, size_type n) const = 0;

  /** Counts the number of characters of type c. */
  virtual size_type countCharacter(char c) const = 0;

  /** Returns the number of dinucleotides defined by the two characters c1 and c2. */
  virtual size_type countDiCharacter(char c1, char c2) const = 0;

  /** Removes the n'th column. */
  virtual void deleteColumn(sequence_size_type n) = 0;

  /** Removes gap columns. */
  virtual void deleteGapColumns() = 0;

  /** Removes all gaps from each sequence. */
  virtual void deleteGaps() = 0;

  /** Returns the index of the sequence with the specified name. Careful:
   *  changed from find algorithm to compare!
   */
  virtual size_type findSequenceByName(const string& name) const = 0;

  /** Finds the sequence id for which the key-value pair of properties is fulfilled.
   *  If not found, returns the number of sequences. */
  virtual size_type findSequenceByProperty(const string& key, const string& value) const = 0;

  /** Returns the n'th column. */
  virtual sequence_type getColumn(sequence_size_type n) const = 0;

  /** Returns the number of characters. */
  virtual sequence_size_type getLength() const = 0;

  /** Returns the name stored for index n. */
  virtual const string& getName(size_type n) const = 0;

  /** Accessors for the alignment score; its meaning is defined by the implementing class. */
  virtual double getScore() const = 0;

  virtual void setScore(double score) = 0;

  /** Accessors for the start column of the reference sequence.
   *  This value is the sort key used by the free comparison operators declared in this header. */
  virtual sequence_size_type getReferenceSequenceStartCol() const = 0;

  virtual void setReferenceSequenceStartCol(sequence_size_type n) = 0;

  /** Accessors for the id of the reference sequence. */
  virtual size_type getReferenceSequenceId() const = 0;

  virtual void setReferenceSequenceId(size_type n) = 0;

  /** Returns the container of all sequence names. */
  virtual const name_container& getNames() const = 0;

  /** Returns the n'th sequence by value (each call yields a copy). */
  virtual sequence_type getSequence(size_type n) const = 0;

  /** Returns the container of all sequences. */
  virtual const sequence_container& getSequences() const = 0;

  /** Returns all properties of the n'th sequence. */
  virtual const properties_type& getSequenceProperties(size_type n) const = 0;

  /** Returns the value of property key of the n'th sequence. */
  virtual string getSequenceProperty(size_type n, const string& key) const = 0;

  /** Sets property key of the n'th sequence to value. */
  virtual void setSequenceProperty(size_type n, const string& key, const string& value) = 0;

  /** Returns true if all sequences have the same length. */
  virtual bool hasEvenLengths() const = 0;

  /** Inserts a gap at the specified position. */
  virtual void insertGap(sequence_size_type pos) = 0;

  /** Returns true if the sequence alignment is internally compressed. */
  virtual bool isCompressed() const = 0;

  /** Returns true if there is a gap at the specified position for all sequences. */
  virtual bool isGapColumn(sequence_size_type pos) const = 0;

  /** Converts '.' characters to '-'. */
  virtual void normalizeGap() = 0;

  /** Returns a vector of size() elements, each element (string) containing the value
   *  corresponding to this sequence and the specified key.
   *  If no property was found, the empty string is used.
   */
  virtual Vec<string> propertyValues(const string& key) const = 0;

  /** Leaves only the fragment starting from position start and with the specified length. */
  virtual void prune(sequence_size_type start, sequence_size_type length) = 0;

  /** Shuffles sequences horizontally. */
  virtual void randomShuffle() = 0;

  /** Shuffles sequences vertically. */
  virtual void randomShuffleColumns(bool keepFirst) = 0;

  /** Shuffles such that the gap pattern is preserved. */
  virtual void randomShuffleColumnsGapPreserving(bool keepFirstFixed) = 0;

  /** Reads a FASTA formatted file. */
  virtual void readFasta(istream& is) = 0;

  /** Brute-force way to remove all internally stored property objects. */
  virtual void removeAllProperties() = 0;

  /** Removes the n'th sequence. */
  virtual void removeSequence(size_type n) = 0;

  /** Exchanges two characters in the sequences. */
  virtual void replace(char cOld, char cNew) = 0;

  /** Sets a column at the specified position. */
  virtual void setColumn(const string& col, sequence_size_type pos) = 0;

  /** Sets the name stored for index n. */
  virtual void setName(const string& name,
                       size_type n) = 0;

  /** Sets sequence and name stored for index n. */
  virtual void setSequence(const sequence_type& s,
                           const string& name,
                           size_type n) = 0;

  /** Sets the sequence stored for index n. */
  virtual void setSequence(const sequence_type& s,
                           size_type n) = 0;


  /** Returns the number of sequences. */
  virtual size_type size() const = 0;

  /** Swaps sequences and names of ids n1 and n2. */
  virtual void swapSequences(size_type n1, size_type n2) = 0;

  /** Transforms to reverse complement (reverse order in sequence, exchanges A->T, T,U->A, G->C, C->G). */
  virtual void transformReverseComplement() = 0;

  /** Uncompresses the internal state of the sequence alignment.
   *  All methods should still work, albeit slower. */
  virtual void uncompress() = 0;

  /** Converts sequence characters to upper case. */
  virtual void upperCaseSequences() = 0;

  /** Returns true if the object is well defined and self-consistent. Returns false for empty alignments. */
  virtual bool validate() const = 0;

  /** Writes a FASTA formatted file. */
  virtual void writeFasta(ostream& os) const = 0;

  /** Writes the defined properties. */
  virtual void writeProperties(ostream& os) const = 0;

  /* STATIC */

  /** Returns true if c is one of the two gap characters (GAP_CHAR or GAP_ALT). */
  static bool isGap(char c) {
    return (c == GAP_CHAR) || (c == GAP_ALT);
  }

  /** Counts the number of gap characters in the half-open range [start, stop). */
  static sequence_size_type countGaps(const string::const_iterator& start, const string::const_iterator stop) {
    sequence_size_type count = 0;
    for (string::const_iterator it = start; it != stop; ++it) {
      if (isGap(*it)) {
        ++count;
      }
    }
    return count;
  }

  /** Counts the positions at which s1 and s2 carry the same character, provided that
   *  character is neither a gap nor contained in ignore (supply e.g. "XN" to skip
   *  ambiguous residues).
   *
   *  Both strings are expected to have equal length (checked with PRECOND); the loop is
   *  additionally bounded by the shorter string.
   */
  static sequence_size_type countIdentical(const string& s1, const string& s2, const string& ignore) {
    PRECOND(s1.size() == s2.size());
    const sequence_size_type n = min(s1.size(), s2.size());
    sequence_size_type count = 0;
    for (sequence_size_type i = 0; i < n; ++i) {
      const char c = s1[i];
      // equality with s2[i] means the gap / ignore test on c covers both sequences
      if ((c == s2[i]) && (!isGap(c)) && (ignore.find(c) == string::npos)) {
        ++count;
      }
    }
    return count;
  }

  /** Counts the superposed positions of s1 and s2: columns in which BOTH sequences carry
   *  a character that is neither a gap nor contained in ignore.
   *
   *  The result is symmetric in s1 and s2 and is never smaller than
   *  countIdentical(s1, s2, ignore), so it is a valid denominator for a fractional identity.
   *  (Earlier revisions inspected s1 only, which made the result depend on argument order.)
   *
   *  Both strings are expected to have equal length (checked with PRECOND); the loop is
   *  additionally bounded by the shorter string so s2 is never read out of range.
   */
  static sequence_size_type countSuperposed(const string& s1, const string& s2, const string& ignore) {
    PRECOND(s1.size() == s2.size());
    const sequence_size_type n = min(s1.size(), s2.size());
    sequence_size_type count = 0;
    for (sequence_size_type i = 0; i < n; ++i) {
      const bool firstCounts = (!isGap(s1[i])) && (ignore.find(s1[i]) == string::npos);
      const bool secondCounts = (!isGap(s2[i])) && (ignore.find(s2[i]) == string::npos);
      if (firstCounts && secondCounts) {
        ++count;
      }
    }
    return count;
  }

  /** Returns the symmetric matrix of pairwise sequence distances of ali.
   *
   *  Entry [i][j] is 1 - (identical positions / superposed positions) as computed by
   *  countIdentical() and countSuperposed(); characters contained in ignore are skipped
   *  by both counts. The diagonal, and every pair without any superposed position,
   *  is left at 0.0. No cutoff check is performed and no warning is written.
   */
  static Vec<Vec<double> > getSeqDistances(const SequenceAlignment& ali, const string& ignore) {
    const size_type numSeqs = ali.size();
    Vec<double> row(numSeqs, 0.0);
    Vec<Vec<double> > field(numSeqs, row);
    for (size_type i = 1; i < numSeqs; ++i) {
      // getSequence() returns by value: fetch each sequence once instead of once per count
      const sequence_type seqI = ali.getSequence(i);
      for (size_type j = 0; j < i; ++j) {
        const sequence_type seqJ = ali.getSequence(j);
        const sequence_size_type numIdent = countIdentical(seqI, seqJ, ignore);
        const sequence_size_type numSup = countSuperposed(seqI, seqJ, ignore);
        if (numSup > 0) {
          const double w = 1 - (static_cast<double>(numIdent)
                                / static_cast<double>(numSup));
          field[i][j] = w;
          field[j][i] = w;
        }
      }
    }
    return field;
  }

};

/** Stream insertion for alignments: delegates to writeFasta(), so the default output is FASTA format. */
inline ostream& operator << (ostream& os, const SequenceAlignment& ali)
{
  ali.writeFasta(os);
  return os;
}

/** Orders alignments by the start column of their reference sequence (strictly less). */
inline bool operator < (const SequenceAlignment& left, const SequenceAlignment& right)
{
  const SequenceAlignment::sequence_size_type leftStart = left.getReferenceSequenceStartCol();
  const SequenceAlignment::sequence_size_type rightStart = right.getReferenceSequenceStartCol();
  return leftStart < rightStart;
}

/** Orders alignments by the start column of their reference sequence (less or equal). */
inline bool operator <= (const SequenceAlignment& left, const SequenceAlignment& right)
{
  const SequenceAlignment::sequence_size_type leftStart = left.getReferenceSequenceStartCol();
  const SequenceAlignment::sequence_size_type rightStart = right.getReferenceSequenceStartCol();
  return leftStart <= rightStart;
}

/** Orders alignments by the start column of their reference sequence (strictly greater). */
inline bool operator > (const SequenceAlignment& left, const SequenceAlignment& right)
{
  const SequenceAlignment::sequence_size_type leftStart = left.getReferenceSequenceStartCol();
  const SequenceAlignment::sequence_size_type rightStart = right.getReferenceSequenceStartCol();
  return leftStart > rightStart;
}

/** Orders alignments by the start column of their reference sequence (greater or equal). */
inline bool operator >= (const SequenceAlignment& left, const SequenceAlignment& right)
{
  const SequenceAlignment::sequence_size_type leftStart = left.getReferenceSequenceStartCol();
  const SequenceAlignment::sequence_size_type rightStart = right.getReferenceSequenceStartCol();
  return leftStart >= rightStart;
}

#endif
