// -*- C++ -*------------------------------------------------------------------
//  $Id: Alignment.h,v 1.16 2005/09/19 16:41:10 bindewae Exp $
//
//  Class:              Alignment
//
//  Description:
//    This class implements a simple alignment type.
//
// ---------------------------------------------------------------------------

#ifndef _ALIGNMENT_H_
#define _ALIGNMENT_H_

// Includes:
#include <Vec.h>
#include <string>
#include <debug.h>
#include <StringTools.h>

/**
 * Simple sequence alignment container.
 *
 * Holds one target sequence (with name and residue offset) and a list of
 * template sequences; each template carries a name, EC code, residue offset,
 * score, e-value and weight in parallel vectors.
 */
class Alignment
{
public:

  /** Characters that are treated as gaps (see isGapLetter). */
  enum {GAP_CHAR = '-', GAP_ALTERNATIVE = '.' };

// CONSTRUCTORS/DESTRUCTOR:
  Alignment();
  Alignment(const Alignment& orig);
  virtual ~Alignment();

  /* OPERATORS */

  /** Assignment operator. Delegates to copy() unless self-assigning. */
  Alignment& operator = (const Alignment& orig);

  /* PREDICATES */
  /** returns a copy of the target sequence */
  string getTarget() const;
  /** returns a copy of the target name */
  string getTargetName() const;
  const string& getTemplate(unsigned int index = 0) const;
  string getTemplateName(unsigned int index = 0) const;
  /** returns all template sequences */
  const Vec<string>& getTemplates() const { return seqTemplate; }
  double getScore(unsigned int index = 0) const;
  double getEvalue(unsigned int index = 0) const;

  bool isConserved(unsigned int pos, unsigned int index) const;
  /* return number between 0.0 and 1.0 (completely conserved) */
  // double computeConservation(unsigned int pos) const;
  /* return number between 0.0 and 1.0 (completely conserved) */
  // double computeEntropy(unsigned int pos) const;
  bool isInsertion(unsigned int pos, unsigned int index = 0) const;
  bool isDeletion(unsigned int pos) const;
  bool isGap(unsigned int pos, unsigned int index = 0) const;
  /** returns true if c is one of the two gap characters GAP_CHAR or GAP_ALTERNATIVE */
  static bool isGapLetter(char c) { return (c == GAP_CHAR) || (c == GAP_ALTERNATIVE); }
  /** returns true if column consists only of gap characters */
  bool isGapColumn(unsigned int p) const;
  /* return number between 1.0 and 0.0 for residue with position pos */
  // double conservationWeight(unsigned int pos) const;
  /** return number of template sequences (the target is not counted) */
  unsigned int size() const;
  /** return length of alignment: length of the first template if there is one,
      otherwise length of the target */
  unsigned int getLength() const;
  double calculateIdentity() const;

  char getTargetPos(unsigned int p) const;
  char getTemplatePos(unsigned int p, unsigned int index = 0) const;

  /** offset for sequence counting. Only needed for alignment. Default: 0.
      NOTE: the offset is stored as int (startAaTarget) but returned as
      unsigned int, so a negative offset is converted on return. */
  unsigned int getTargetAminoAcidOffset() const { return startAaTarget; }
  /** return offset of n'th template (counting from zero) */
  int getTemplateAminoAcidOffset(unsigned int index = 0) const;
  void saveFasta(ostream& output) const;
  /* save as Clustal like output. Can be used as input for BLAST */
  void saveClustal(ostream& output) const;
  /* save as MSAF format (presumably the counterpart of loadMSAF; the original
     comment was a copy of the saveClustal comment) */
  void saveMSAF(ostream& output) const;
  /* save as Danforth sequence alignment format */
  void saveDSAF(ostream& output) const;
  /** saves as typical printable output */
  void savePrintable(ostream& os) const;
  /** return number of superposed amino acids for target and template n */
  unsigned int numSuperposed(unsigned int n) const;
  /** return number of superposed amino acids; presumably for sequences n and m
      (the original comment was copied from the one-argument overload -
      TODO confirm against the implementation) */
  unsigned int numSuperposed(unsigned int n, unsigned int m) const;
  /** return EC number of template i ("0.0.0.0" if none is stored) */
  string getEC(unsigned int i) const;
  /** return "slice" of profile at position pos  */
  Vec<char> getSlice(unsigned int pos) const;
  string getSliceString(unsigned int pos) const;
  /** return index of sequence with lowest sequence identity compared to orig */
  // unsigned int findWorstSequence(unsigned int orig) const;

  /** returns index of template with name containing s */
  unsigned int findTemplateByName(const string& s) const;

  /** returns true if at least one template sequence is defined */
  bool isValid() const { return size() > 0; }

  /** return true if alignment makes sense at all (same lengths of sequences etc) */
  bool isConsistent() const;

  /** returns true if all sequences of both alignments are equal */
  bool isEqual(const Alignment& other) const;

  /** counts number of characters and gaps */
  void countCharacters(unsigned int& numChars, unsigned int& numGaps) const;

  /** counts number of occurrences of a certain character */
  int countCharacter(char c) const;

  /** returns the sequence weights as stored by setWeights */
  const Vec<double>& getWeights() const { return seqWeights; }

  /* MODIFIERS */

  void clearAlignment();
  void setTarget(string t, string tName = "target");
  void setTemplate(string t, string tName = "template", double tScore = 0.0,
                   double tEvalue = -1.0);
  /** sets n'th sequence; n must be a valid template index */
  void setTemplate(string t, unsigned int n) {
    PRECOND(n < seqTemplate.size(), exception);
    seqTemplate[n] = t;
  }
  /** sets name of n'th sequence; n must be a valid template index */
  void setTemplateName(string t, unsigned int n) {
    PRECOND(n < seqTemplateName.size(), exception);
    seqTemplateName[n] = t;
  }

  void addTemplate(const string& seq, const string& name);

  void setScore(double val, unsigned int index = 0);
  void setEvalue(long double val, unsigned int index = 0);
  /** removes all templates */
  void clearTemplate();
  /** removes all info about n'th template */
  void clearTemplate(unsigned int n);
  /** removes all templates below index */
  void cutTemplate(unsigned int index);

  /** offset for sequence counting. Only needed for alignment. Default: 0 */
  void setTargetAminoAcidOffset(int orig);
  /** set offset of n'th template (counting from zero) */
  void setTemplateAminoAcidOffset(unsigned int index, int val);
  /** read FASTA format input */
  void loadFasta(istream& input);
  /** read MASE format input (see program Seaview) */
  void loadMase(istream& input);
  /** read Danforth format input */
  void loadDanforth(istream& input);
  /** read output of CE program: */
  void loadCEBody(istream& input);
  /** read output of CE program: */
  void loadCE(istream& input);

  /** read BLAST output produced with blast option -m 6
      this stands for a convenient form of multiple sequence alignment  */
  void loadBlastMode6(istream& input);
  /** read MSAF format */
  void loadMSAF(istream& input);
  /** copy method */
  void copy(const Alignment& orig);
  /** for all sequences, erase portion starting at minPos with
      length len */
  void cleave(unsigned int minPos, unsigned int len);
  /** insert "-" in target and all templates at position p (also counting the other "-" characters as position) */
  void insertDash(unsigned int p);
  /** insert arbitrary character in target and all templates at position p (also counting the other "-" characters as position) */
  void insertCharacter(unsigned int p, char c);
  /** delete character of position p (or '-') from target and all templates */
  void deletePos(unsigned int p);
  /** NOTE(review): the original comment here was a copy of the deletePos
      comment. Judging by the name, this removes target insertions -
      confirm against the implementation. */
  void purgeTargetInsertions();
  /** NOTE(review): variant of purgeTargetInsertions taking a sequence index n;
      the original comment ("delete character of position p (or '-') from
      template n and all templates") does not match the signature -
      confirm against the implementation. */
  void purgeTargetInsertions(unsigned int n);
  /** delete columns which are only gaps in all positions */
  void purgeGapColumns();
  /** for all sequences, keep only portion starting at minPos with
      length len */
  void prune(unsigned int minPos, unsigned int len);
  /** combine two multiple sequence alignments of same target */
  void addAlignment(const Alignment& other);

  void swapTemplate(unsigned int index1, unsigned int index2);

  /** delete n'th character */
  string deleteChar(const string& s, unsigned int n);

  /** delete gaps in n'th sequence. Careful: alignment might be inconsistent after this! */
  void deleteGaps(unsigned int n);
  /** delete all gaps in all sequences. Careful: alignment might be inconsistent after this! */
  void deleteGaps();

  /** delete template sequence n */
  void deleteSequence(unsigned int n);

  /** sets one column at position pos */
  void setSlice(const string& slice, unsigned int pos);

  /** sets target mode: If true, first line in alignment is treated separately as target. Default: false. */
  void setTargetMode(bool b) { targetMode = b; }

  /** Stores the sequence weights v, rescaled so that their arithmetic mean is 1.
      An empty v just clears the stored weights. For a non-empty v the mean
      must be greater than zero; the check runs before anything is stored, so
      a rejected v leaves the current weights untouched. */
  void setWeights(const Vec<double>& v) {
    if (v.size() == 0) {
      // nothing to normalize; also avoids computing 0.0 / 0
      seqWeights = v;
      return;
    }
    double sum = 0.0;
    for (unsigned int i = 0; i < v.size(); ++i) {
      sum += v[i];
    }
    sum /= v.size(); // from here on sum holds the mean weight
    ERROR_IF(sum <= 0.0,
             "Sum of sequence weights must be greater zero!", exception);
    seqWeights = v;
    for (unsigned int i = 0; i < seqWeights.size(); ++i) {
      seqWeights[i] /= sum;
    }
  }

  /** transform sequences to upper case */
  void upperCaseSequences();

  /** transform character in sequences */
  void replaceChar(char cOld, char cNew);

  /** add gap characters at left and right ends */
  void addAllGapEnds(unsigned int numLeft, unsigned int numRight);

  // STATIC

  /* return sequence without "-" characters */
  static string getPureSequence(const string& s);

  /* return position of index if "-" would not be there
     Counting starts from zero. */
  static unsigned int getOrigPos(const string& s, unsigned int p);

  /* return position of original index if "-" are now present */
  static unsigned int getNewPos(const string& s, unsigned int p);

  /* tokenize text : return vector of words of a line of text*/
  static Vec<string> getTokens(const string& text);

  /* return number between 0.0 and 1.0 (completely conserved) */
  static double computeConservation(const string& slice);


  /* if true, read also target string, otherwise read only templates */
  /* IMPLEMENTATION NOT COMPLETE! */
  bool targetMode;
  bool readScoreMode; // workaround to ignore alignment body
  int outputLineLength; // for saving to for example fasta files

protected:

private:
  /** read output of CE program: */
  void loadCEHeader(istream& input);
  /** compute entropy given a column of a multiple sequence alignment */
  double computeEntropy(const Vec<char>& slice) const;
  /** fix missing characters of template sequences */
  string fixedSequence(const string& ref, const string& cmp);
  /* fill up template sequences with "-" until size equal target sequence length
     used as workaround for bad Blast output format */
  void fixAlignmentBody();

  /** returns true if correct initialization character is given.
      fastaMode = true: fasta format, otherwise gde format
      (output of clustalw with option -output=gde)
  */
  bool isFastaChar(char c, bool fastaMode) const;

  /** saves as typical printable output */
  void savePrintable(ostream& os, unsigned int id, unsigned int pos, unsigned int len) const;

  /** saves single line of for example FASTA type file */
  void sSaveFasta(string t, string tName, ostream& output) const;

  /** returns s with numLeft gap characters prepended and numRight gap
      characters appended */
  static string addGapEnds(const string& s,
                           unsigned int numLeft, unsigned int numRight) {
    string result = s;
    if (numLeft > 0) {
      result = string(numLeft, GAP_CHAR) + result;
    }
    if (numRight > 0) {
      result = result + string(numRight, GAP_CHAR);
    }
    POSTCOND(s.size() + numLeft + numRight == result.size(), exception);
    return result;
  }

// HELPERS:

// ATTRIBUTES:
  string target;
  string targetName;
  Vec<string> seqTemplate;
  Vec<string> seqTemplateName;
  Vec<string> eccodes;

  int startAaTarget; // start amino acid. Default: 0
  Vec<int> startAaTemplates; // start amino acid. Default: 0
  Vec<double> score;  // score, eg. (in bits) from blast. Default = 0
  Vec<long double> evalue; // expectation value, eg. from blast. Default = 0
  Vec<double> seqWeights; // weights of sequences. Careful: currently has to be maintained by user!
};

// ---------------------------------------------------------------------------
//                                    Alignment
// -----------------x-------------------x-------------------x-----------------

// PREDICATES:

/** Number of template sequences stored (the target is not counted). */
inline unsigned int
Alignment::size() const
{
  const unsigned int numSequences = seqTemplate.size();
  return numSequences;
}

/** Alignment length: length of the first template when templates exist,
    otherwise the length of the target sequence. */
inline unsigned int
Alignment::getLength() const
{
  // without templates the target is the only sequence to measure
  if (seqTemplate.size() == 0) {
    return target.length();
  }
  return seqTemplate[0].size();
}

/** Returns a copy of the target sequence. */
inline string
Alignment::getTarget() const
{
  string targetCopy(target);
  return targetCopy;
}

/** Returns a copy of the name of the target sequence. */
inline string
Alignment::getTargetName() const
{
  string nameCopy(targetName);
  return nameCopy;
}


// MODIFIERS:

/** offset for sequence counting. Only needed for alignment. Default: 0 */
inline
void 
Alignment::setTargetAminoAcidOffset(int orig)
{
  startAaTarget = orig;
}

/** return offset of n'th template (counting from zero) */
inline
void 
Alignment::setTemplateAminoAcidOffset(unsigned int index, int val)
{
  PRECOND(index < startAaTemplates.size(), exception);
  startAaTemplates[index] = val;
}

/** Assignment operator: copies orig into this object via copy(),
    doing nothing on self-assignment. */
inline Alignment&
Alignment::operator = (const Alignment& orig)
{
  // no precondition
  if (this == &orig) {
    return *this;
  }
  copy(orig);
  //   POSTCOND( (orig == *this), exception);
  return *this;
}

/** Returns the EC code stored for template i, or the placeholder "0.0.0.0"
    when i is out of range or the stored code is empty. */
inline string
Alignment::getEC(unsigned int i) const
{
  const bool hasCode = (i < eccodes.size()) && (eccodes[i].size() != 0);
  if (hasCode) {
    return eccodes[i];
  }
  return string("0.0.0.0");
}


// } // namespace


#endif //_ALIGNMENT_H_
