#include <SimpleSequenceAlignment.h>
#include <StringTools.h>
#include <Random.h>
#include <algorithm>

/** Deletes every alignment column in which the sequence with index seqId
 *  holds a gap character. Columns are visited from right to left, so the
 *  indices of the columns still to be inspected are not shifted by a deletion.
 *  @param seqId index of the reference sequence; must be smaller than size() */
void
SimpleSequenceAlignment::collapse(size_type seqId) {
  PRECOND(seqId < size());
  // work on a snapshot: the alignment itself shrinks while we iterate
  const string reference = getSequence(seqId);
  int col = static_cast<int>(reference.size());
  while (col > 0) {
    --col;
    if (!isGap(reference[col])) {
      continue;
    }
    deleteColumn(static_cast<size_type>(col));
  }
}

/** Compresses the internal state of the sequence alignment.
 *  Every column is run-length encoded and stored in compressedCols; the
 *  row-wise storage (sequences) is then discarded.
 *  Does nothing if the alignment is already compressed or contains no sequences.
 *  Original author's note: all methods should still work, albeit slower.
 *  Several methods in this file nevertheless demand the uncompressed state
 *  via PRECOND(!isCompressed()). */
void
SimpleSequenceAlignment::compress() {
  if (isCompressed() || (size() == 0)) {
    return;
  }
  ASSERT(getLength() > 0);
  ASSERT(hasEvenLengths());
  // remember the length now; it is compared against getLength() after the switch
  size_type len = getLength();
  // one encoded entry per alignment column
  compressedCols = sequence_container(len);
  for (size_type i = 0; i < len; ++i) {
    ASSERT(!isCompressed()); // important, otherwise getColumn would already try to use runLengthDecode etc
    compressedCols[i] = runLengthEncode(getColumn(i), ENCODE_LEN_MAX);
  }
  // the flag is flipped only after all columns were read from the uncompressed storage
  sequences.clear(); // scary!
  compressed = true;
  POSTCOND(len == getLength());
  POSTCOND(isCompressed());
}

/** Compress internal state of  sequence alignment.
 *  All methods should still work, albeit slower */
void
SimpleSequenceAlignment::uncompress() {
  if (!isCompressed()) {
    return;
  }
  compressed = false;
  size_type len = getLength();
  size_type sz = size(); // avoid bug by not calling size and getLength in this function again
  sequences = sequence_container(sz, sequence_type(len, 'X')); 
  for (size_type i = 0; i < getLength(); ++i) {
    sequence_type col = runLengthEncode(getColumn(i), ENCODE_LEN_MAX);
    for (size_type j = 0; j < col.size(); ++j) {
      sequences[j][i] = col[j]; // notice reverse order of indices 
    }  
  }
  compressedCols.clear(); // scary!
  POSTCOND(!isCompressed());
  POSTCOND(len == getLength()); // length has not changed
}

/** Tallies gap and non-gap symbols of sequence i.
 *  Every symbol for which isGap() is false counts as a character; the
 *  original note points out that this includes "X" and "N".
 *  Both counters are incremented, not reset, so the caller chooses the start values.
 *  @param numChars incremented once per non-gap symbol
 *  @param numGaps incremented once per gap symbol
 *  @param i index of the sequence to inspect
 *  Requires the uncompressed state. */
void
SimpleSequenceAlignment::countCharacters(sequence_size_type& numChars, sequence_size_type& numGaps,
					 size_type i) const {
  PRECOND(!isCompressed());
  const sequence_type& row = sequences[i];
  for (sequence_type::const_iterator it = row.begin(); it != row.end(); ++it) {
    sequence_size_type& tally = isGap(*it) ? numGaps : numChars;
    ++tally;
  }
}

/** Returns how often character c occurs in all sequences taken together.
 *  Requires the uncompressed state. */
SimpleSequenceAlignment::size_type
SimpleSequenceAlignment::countCharacter(char c) const
{
  PRECOND(!isCompressed());
  size_type total = 0;
  for (size_type row = 0; row < sequences.size(); ++row) {
    const sequence_type& seq = sequences[row];
    total += static_cast<size_type>(std::count(seq.begin(), seq.end(), c));
  }
  return total;
}

/** Returns how often character c1 is immediately followed by c2 within a
 *  sequence, summed over all sequences. Overlapping occurrences are counted
 *  individually. Requires the uncompressed state. */
SimpleSequenceAlignment::size_type
SimpleSequenceAlignment::countDiCharacter(char c1, char c2) const {
  PRECOND(!isCompressed());
  size_type hits = 0;
  for (size_type row = 0; row < sequences.size(); ++row) {
    const sequence_type& seq = sequences[row];
    if (seq.size() < 2) {
      continue; // no adjacent pair available
    }
    char previous = seq[0];
    for (sequence_size_type pos = 1; pos < seq.size(); ++pos) {
      const char current = seq[pos];
      if ((previous == c1) && (current == c2)) {
        ++hits;
      }
      previous = current;
    }
  }
  return hits;
}


/** Removes column p from the alignment.
 *  Works in compressed and uncompressed state. A position beyond the end is
 *  ignored: in uncompressed state each sequence is checked individually, in
 *  compressed state the column container is checked.
 *  @param p zero-based index of the column to remove */
void
SimpleSequenceAlignment::deleteColumn(sequence_size_type p) {
  if (isCompressed()) {
    // one container entry per column; erasing at or past end() would be
    // undefined behaviour, so apply the same range check as the branch below
    if (p < compressedCols.size()) {
      compressedCols.erase(compressedCols.begin() + p);
    }
  } else {
    for (size_type i = 0; i < sequences.size(); ++i) {
      if (p < sequences[i].size()) {
        sequences[i].erase(p, 1); // erases p'th character
      }
    }
  }
}

/** Removes all GAP_CHAR characters from each sequence.
 *  Sequences may have different lengths afterwards.
 *  Requires the uncompressed state. */
void
SimpleSequenceAlignment::deleteGaps() {
  PRECOND(!isCompressed());
  for (size_type i = 0; i < size(); ++i) {
    // removeFromString hands back the stripped copy and leaves its argument
    // untouched (cf. its use in randomShuffleStringGapPreserving), so the
    // result has to be stored; discarding it made this method a no-op.
    sequences[i] = removeFromString(sequences[i], GAP_CHAR);
  }
}

/** Returns the index of the first sequence whose name equals the given name
 *  exactly, or the number of names if there is none.
 *  Careful: changed from the find algorithm to string compare! */
SequenceAlignment::size_type
SimpleSequenceAlignment::findSequenceByName(const string& name) const
{
  const sequence_size_type total = names.size();
  sequence_size_type idx = 0;
  while ((idx < total) && (names[idx].compare(name) != 0)) {
    ++idx;
  }
  return idx; // equals names.size() if nothing matched
}

/** Finds the first sequence id whose properties contain the given key with
 *  exactly the given value.
 *  @param key property name to look up
 *  @param value required property value
 *  @return id of the first matching sequence; size() (number of sequences) if
 *          there is no match or no properties are defined */
SequenceAlignment::size_type
SimpleSequenceAlignment::findSequenceByProperty(const string& key, const string& value) const {
  for (size_type i = 0; i < properties.size(); ++i) {
    properties_type::const_iterator foundPair = properties[i].find(key);
    if ((foundPair != properties[i].end()) && (foundPair->second == value)) {
      return i;
    }
  }
  // Always report "not found" as size(): properties.size() may differ from
  // the number of sequences and could then be mistaken for a valid id.
  return size();
}

/** Collects the value stored under the given key for every sequence.
 *  The result has size() elements; element i is the value found in the
 *  properties of sequence i. If no property was found, the element keeps
 *  its initial (default-constructed) string.
 */
Vec<string>
SimpleSequenceAlignment::propertyValues(const string& key) const {
  Vec<string> values(size());
  const size_type numEntries = properties.size();
  for (size_type seq = 0; seq != numEntries; ++seq) {
    const properties_type& entry = properties[seq];
    properties_type::const_iterator hit = entry.find(key);
    if (hit == entry.end()) {
      continue; // key not defined for this sequence
    }
    values[seq] = hit->second;
  }
  return values;
}


/** Returns true if c is one of the accepted header start characters
 *  ('>', '%' or '#').
 */
bool
SimpleSequenceAlignment::isFastaChar(char c) const
{
  switch (c) {
  case '>':
  case '%':
  case '#':
    return true;
  default:
    return false;
  }
}

/** converts '.' charachter to '-' */
void
SimpleSequenceAlignment::normalizeGap() {
  PRECOND(!isCompressed());
  for (size_type i = 0; i < size(); ++i) {
    sequences[i] = translate(sequences[i], GAP_ALT,GAP_CHAR);
  }
}


/** Keeps only the fragment that begins at position start and spans the given length.
 *  Per sequence: if the whole fragment fits, exactly that fragment is kept;
 *  if only the start lies inside the sequence, everything from start to the
 *  end is kept; otherwise the sequence is left unchanged.
 *  referenceSequenceStartCol is advanced by start in every case.
 *  Requires the uncompressed state. */
void
SimpleSequenceAlignment::prune(sequence_size_type start, sequence_size_type length) {
  PRECOND(!isCompressed()); // FIXIT : not too hard to implement for compressed mode
  for (size_type row = 0; row < size(); ++row) {
    sequence_type& seq = sequences[row];
    const sequence_type::size_type seqLen = seq.size();
    if ((start + length) <= seqLen) {
      seq = seq.substr(start, length);
      continue;
    }
    if (start < seqLen) {
      seq = seq.substr(start);
    }
  }
  referenceSequenceStartCol += start; // adjust index of reference sequence start
}

/** Reads an alignment in FASTA format from input, replacing the current content.
 *  - The first line must exist and be non-empty, otherwise ERROR is raised.
 *    If it starts with ';' it is skipped.
 *  - A header line contributes a name; a leading character accepted by
 *    isFastaChar() is stripped from it.
 *  - The following lines, with white space removed, are concatenated into the
 *    sequence until the next header line or the end of input.
 *  - A trailing empty sequence is removed with a warning; a trailing '1' of
 *    the last sequence is cut off.
 *  - The "assembly" property of each sequence is set to its name.
 *  @param input stream to read from */
void
SimpleSequenceAlignment::readFasta(istream& input) {
  clear();
  ASSERT(!isCompressed());
  string tmp;
  tmp = getLine(input);
  SequenceAlignment::size_type count = 0;

  if ((!input) || (tmp.size() == 0)) {
    ERROR("Abnormal input file end.");
  }
  char firstChar = tmp[0];
  if (firstChar == ';') {
    // must be pathetic Stanford format
    tmp = getLine(input);
  }
  while (input)
    {
      if (tmp.size() == 0) {
        // Empty line where a header was expected: fetch the next line.
        // A bare 'continue' would spin forever, because neither tmp nor the
        // stream state would change.
        tmp = getLine(input);
        continue;
      }
      if (isFastaChar(tmp[0])) {
        names.push_back(tmp.substr(1));
      }
      else {
        names.push_back(tmp);
      }
      sequences.push_back("");
      while (input) {
        tmp = getLine(input);
        string tmp2 = removeWhiteSpaceFromString(tmp);
        if (tmp2.size() == 0) {
          continue;
        }
        // cout << "read line: " << tmp << endl;
        if (!input) {
          // Warning: This 'if' serves to remove a bug
          // which causes the last line in a file to be read twice.
          if ((!isFastaChar(tmp2[0]))
              && (sequences[count].size()!=getLength())) {
            sequences[count] += tmp2;
          }
          break;
        }
        if (!isFastaChar(tmp[0])) {
          sequences[count] += tmp2;
        }
        else {
          break; // tmp holds the next header; handled by the outer loop
        }
      };
      count++;
    }
  // removes last sequence if it was empty
  if ((size() > 0) && (getSequence(size()-1).size() == 0)) {
    cerr << "Warning: last sequence was empty!" << endl;
    removeSequence(size()-1);
  }
  // Cut off a trailing '1' of the last sequence (NOTE(review): presumably an
  // end marker of some input flavour; confirm). Guarded, because after the
  // removal above the container or its last sequence may be empty, and
  // "size() - 1" would then wrap around and index out of bounds.
  if (sequences.size() > 0) {
    SequenceAlignment::size_type lastCount = sequences.size()-1;
    if (sequences[lastCount].size() > 0) {
      SequenceAlignment::size_type lastSizeM = sequences[lastCount].size() -1;
      char lastChar = sequences[lastCount][lastSizeM];
      if (lastChar == '1') {
        sequences[lastCount] = sequences[lastCount].substr(0, lastSizeM); // remove last char
      }
    }
  }
  properties = Vec<SequenceAlignment::properties_type>(sequences.size());
  for (size_type i = 0; i < size(); ++i) {
    properties[i]["assembly"] = names[i]; // workaround
    // properties[i]["chromStart"] = "0"; // start counting at position 0 as is done in UCSC Genome Browser data
    // properties[i]["length"] = uitos(sequences[0].size()); // start counting at position 0 as is done in UCSC Genome Browser data
  }
  ASSERT(validate());
}

/** exchange two characters in sequences */
void
SimpleSequenceAlignment::replace(char cOld, char cNew) {
  PRECOND(!isCompressed());
  for (SequenceAlignment::size_type i = 0; i < sequences.size(); ++i) {
    sequences[i] = translate(sequences[i], cOld, cNew);
  }
}

/** transforms to reverse complement (reverse order in sequence, exchanges A->T, T,U->A, G->C, C->G */
char
SimpleSequenceAlignment::computeReverseComplement(char c, const string& fromAlphabet, const string& toAlphabet) {
  PRECOND(fromAlphabet.size() == toAlphabet.size());
  for (string::size_type i = 0; i < fromAlphabet.size(); ++i) {
    if (fromAlphabet[i] == c) {
      return toAlphabet[i];
    }
  }
  return c; // cannot be translated
}

/** Returns the reverse complement of s: the characters are taken in reverse
 *  order and each one is translated from fromAlphabet to toAlphabet with the
 *  single-character overload. */
string
SimpleSequenceAlignment::computeReverseComplement(const string& s, const string& fromAlphabet, const string& toAlphabet) {
  string complemented;
  complemented.reserve(s.size());
  for (string::const_reverse_iterator it = s.rbegin(); it != s.rend(); ++it) {
    complemented += computeReverseComplement(*it, fromAlphabet, toAlphabet);
  }
  return complemented;
}

/** transforms to reverse complement (reverse order in sequence, exchanges A->T, T,U->A, G->C, C->G
 * parsing compatible with MAFAlignment class from locorna package
 */
void
SimpleSequenceAlignment::transformReverseComplement() {
  PRECOND(!isCompressed());
  for (size_type i = 0; i < size(); ++i) {
    sequences[i] = computeReverseComplement(sequences[i], "ACGTU", "TGCAA");
    // adjust start and stop values:
    // parsing compatible with MAFAlignment class from locorna package
    string chromStartString = getSequenceProperty(i, "chromStart"); // entry["chromStart"] = words[2]; // keep it zero-based; do not add one
    string lengthString = getSequenceProperty(i, "length"); // entry["length"] = words[3];
    string strandString = getSequenceProperty(i, "strand"); // entry["strand"] = words[4]; // FIXIT
    string totLengthString = getSequenceProperty(i, "totLength"); // entry["totLength"] = words[5]; // problem
    if ((chromStartString.size() > 0) && (lengthString.size() > 0) && (strandString.size() == 1) && (totLengthString.size() > 0)) {
      long chromStart = stol(chromStartString);
      long length = stol(lengthString);
      long totLength = stol(totLengthString);
      long chromStart2 = totLength - chromStart - length;
      if (strandString[0] == '+') {
	strandString = "-";
      } else if (strandString[0] == '-') {
	strandString = "+";
      }
      setSequenceProperty(i, "chromStart", ltos(chromStart2));
      setSequenceProperty(i, "strand", strandString);
      // length and totLength do not change
    }
  }
}

/** Shuffles sequences horizontally without preserving conservation. */
void
SimpleSequenceAlignment::randomShuffle() 
{
  PRECOND(!isCompressed()); // FIXIT: not too hard to implement for compressed mode
  Random& rnd = Random::getInstance();
  for (size_type i = 0; i < size(); ++i) {
    random_shuffle(sequences[i].begin(), sequences[i].end(), rnd);
  }
}

/** Shuffles sequences horizontally, all rows are shuffled the same way, leaving columns intact. */
void
SimpleSequenceAlignment::randomShuffleHoriz() 
{
  PRECOND(!isCompressed()); // FIXIT: not too hard to implement for compressed mode
  Random& rnd = Random::getInstance();
  Vec<unsigned int> ids(getLength());
  for (size_type i = 0; i < getLength(); ++i) {
    ids[i] = i;
  }
  random_shuffle(ids.begin(), ids.end(), rnd);
  for (size_type i = 0; i < size(); ++i) {
    string s = sequences[i];
    string s2 = s;
    for (size_type j = 0; j < s2.size(); ++j) { 
      s2[j] = s[ids[j]];
    }
    setSequence(s2, i);
  }
}

/** shuffles sequences vertically */
void
SimpleSequenceAlignment::randomShuffleColumns(bool keepFirst) 
{
  PRECOND(!isCompressed()); // FIXIT: not too hard to implement for compressed mode
  Random& rnd = Random::getInstance();
  // DEBUG_MSG("Starting randomShuffleColumns!");
  for (sequence_size_type i = 0; i < getLength(); ++i) {
    sequence_type column = getColumn(i);
    if (keepFirst) {
      if (column.size() > 1) {
	random_shuffle(column.begin() + 1, column.end(), rnd);
	setColumn(column, i);
      }
    } 
    else {
	random_shuffle(column.begin(), column.end(), rnd);
	setColumn(column, i);
    }
  }
  // DEBUG_MSG("Finished randomShuffleColumns!");
}

/** Shuffles the non-gap characters of a string while every gap stays at its
 *  original position.
 *  @param origString string to shuffle; it is not modified
 *  @return string of the same length with an identical gap pattern; strings
 *          with fewer than two characters are returned unchanged */
string
SimpleSequenceAlignment::randomShuffleStringGapPreserving(const string& origString) 
{
  string column = origString;
  Random& rnd = Random::getInstance();
  if (column.size() < 2) {
    return column; // nothing to do, fewer than two characters
  }
  sequence_size_type gapCount = countGaps(column.begin(), column.end());
  // s receives the non-gap characters only; column itself keeps its gaps
  string s = removeFromString(column, GAP_CHARS);
  ASSERT(countGaps(s.begin(), s.end()) == 0); // all gap characters must have been removed
  random_shuffle(s.begin(), s.end(), rnd);
  // write the shuffled characters back into the non-gap positions, left to right
  sequence_size_type pos = 0;
  for (string::size_type i = 0; i < s.size(); ++i) {
    // skip gap positions; they keep their original content
    while (isGap(column[pos])) {
      ++pos;
    }
    ASSERT(pos < column.size());
    ASSERT(!isGap(column[pos]));
    column[pos] = s[i];
    pos += 1;
  }
  ASSERT(column.size() == origString.size()); // size of output should be equal to size of input string
  ASSERT(gapCount == countGaps(column.begin(), column.end())); // number of gaps should not have changed
#ifndef NDEBUG
  // debug builds only: verify the gap pattern position by position
  for (string::size_type i = 0; i < column.size(); ++i) {
    ASSERT(isGap(column[i]) == isGap(origString[i])); // gap pattern should be identical
  }
  // if ((column.size() > 5) && (column == origString)) {
  //   cerr << "Warning: gap-preserving shuffling did not change string: " << column << endl; // can legally happen by chance
  // }
#endif
  return column;
}

/** Shuffles one alignment column such that its gap pattern is preserved.
 *  @param col index of the column to shuffle
 *  @param keepFirstFixed if true, the first character of the column stays in
 *         place and only the remainder is shuffled
 *  Columns with fewer than two characters are left untouched. */
void
SimpleSequenceAlignment::randomShuffleColumnGapPreserving(sequence_size_type col, bool keepFirstFixed) {
  string column = getColumn(col);
  if (column.size() < 2) {
    return; // nothing to do
  }
  // number of leading characters excluded from the shuffle
  const sequence_size_type offset = keepFirstFixed ? 1 : 0;
  string movable = column.substr(offset);
  ASSERT(movable.size() + offset == column.size());
  movable = randomShuffleStringGapPreserving(movable);
  ASSERT(offset + movable.size() <= column.size());
  // overwrite the shuffled part in place; the lengths match, so the column size is unchanged
  column.replace(offset, movable.size(), movable);
  setColumn(column, col);
}

/** shuffles string such that gap pattern is preserved */
void
SimpleSequenceAlignment::randomShuffleColumnsGapPreserving(bool keepFirstFixed) {
  for (sequence_size_type i = 0; i < getLength(); ++i) {
    randomShuffleColumnGapPreserving(i, keepFirstFixed);
  }
}

/** convers sequence characters to upper case */
void
SimpleSequenceAlignment::upperCaseSequences()
  {
  PRECOND(!isCompressed()); // FIXIT: not too hard to implement for compressed mode
  for (size_type i = 0; i < size(); ++i) {
    upperCase(sequences[i]);
  }
}


/** Writes the alignment in FASTA format: for each sequence one header line
 *  (FASTA_CHAR followed by the name) and one line with the sequence.
 *  @param os stream to write to */
void
SimpleSequenceAlignment::writeFasta(ostream& os) const {
  const SequenceAlignment::size_type numRows = size();
  SequenceAlignment::size_type row = 0;
  while (row < numRows) {
    os << FASTA_CHAR << names[row] << endl;
    os << getSequence(row) << endl; // FIXIT: slow
    ++row;
  }
}

/** Writes the defined sequence properties to os.
 *  For every sequence with at least one property a "# <name>" line is written,
 *  followed by one "key=value" line per property. If the number of property
 *  sets does not match the number of sequences, a single comment line saying
 *  so is written instead.
 *  @param os stream to write to */
void
SimpleSequenceAlignment::writeProperties(ostream& os) const {
  if (properties.size() != size()) {
    // goes to the requested stream, not to cout, so the note ends up in the
    // same place as the regular output
    os << "# No sequence properties defined!" << endl;
    return;
  }
  for (SequenceAlignment::size_type i = 0; i < size(); ++i) {
    const properties_type& prop = properties[i];
    if (prop.size() == 0) {
      continue; // nothing stored for this sequence
    }
    os << "# " << names[i] << endl;
    for (properties_type::const_iterator j = prop.begin(); j != prop.end(); ++j) {
      os << j->first << "=" << j->second << endl;
    }
  }
}
