#include <MAFAlignment.h>
#include <NucleotideTools.h>
#include <SequenceAlignment.h>
#include <SequenceAlignmentTools.h>
#include <StringTools.h>
#include <fstream>
#include <ContainerTools.h>

/** Shifts every stored "chromStart" value (all blocks, all sequences) by the given offset.
 *  Important for concatenating alignments so that their coordinates do not overlap.
 *  Note: the assembly argument is currently not evaluated; the offset is applied to every sequence. */
void
MAFAlignment::addChromStartOffset(length_type offset, const string& assembly) {
  PRECOND(size() == chromStarts.size());
  for (size_type blockId = 0; blockId < chromStarts.size(); ++blockId) {
    // FIXIT: maybe only the one assembly chromStart value should be changed?
    size_type rowId = 0;
    while (rowId < chromStarts[blockId].size()) {
      chromStarts[blockId][rowId] += offset;
      ++rowId;
    }
  }
  // drop the cached reference starts so that they are rebuilt from the shifted values
  refChromStarts.clear();
  updateRefChromStarts();
}

/** Collapses all alignments with respect to sequences of this assembly:
 *  Removes all columns that correspond to a gap in sequence with specified number.
 */
void
MAFAlignment::collapseAssembly(const string& assembly) {
  totalLength = 0;
  for (size_type i = 0; i < size(); ++i) {
    // REMARK << "Collapsing alignment " << (i+1) << endl;
    SequenceAlignment& ali = (*this)[i];
    SequenceAlignment::size_type seqId = getAssemblyRowId(i, assembly); // ali.findSequenceByProperty("assembly", assembly);
    if ((seqId >= 0) && (seqId < ali.size())) {
      // REMARK << "Collapsing with sequence id: "<< seqId << endl;
      ali.collapse(seqId);
    } else {
      REMARK << "Warning: could not collapse alignment " << (i+1) << " because assembly not found: " << assembly << endl;
    }
    totalLength += ali.getLength();
    // REMARK << "done." << endl;
  } 
  updateColumnIds();
  // updateAssemblyToColumnMapping();
}

/** Sums up alignment lengths per combination of assemblies: each block is keyed by its
 *  assemblies hash string (see getAlignmentAssembliesHash) and its length is added to
 *  the entry of that key. */
MAFAlignment::count_hash_type
MAFAlignment::countAssembliesHashLengths() const {
  count_hash_type lengthByHash;
  for (size_type blockId = 0; blockId < size(); ++blockId) {
    const string key = getAlignmentAssembliesHash(blockId);
    lengthByHash[key] += (*this)[blockId].getLength();
  }
  return lengthByHash;
}


/** Shuffle each alignment */
void
MAFAlignment::dinucleotideShuffle(double normLimit, bool shuffleColumnMode) {
  size_type iterations;
  string residues = getResidues();
  for (size_type i = 0; i < size(); ++i) {
    SequenceAlignment::size_type len = ((*this))[i].getLength();
    iterations = 2 * len; // 2 times length of alignment
    if (verbose > 2) {
      REMARK << "Shuffling alignment " << (i+1) << " of length " << len << " with " << iterations << " iterations." << endl;
    }
    SequenceAlignmentTools::dinucleotideShuffle((*this)[i], residues, normLimit, iterations, shuffleColumnMode);
  }
}

/** Shuffle each row of each block such that columns do not remain intact */
void
MAFAlignment::shuffleRows() {
  if (verbose > 2) {
    REMARK << "Starting shuffleRows" << endl;
  }
  for (size_type i = 0; i < size(); ++i) {
    ((*this)[i]).randomShuffle();
  }
  if (verbose > 2) {
    REMARK << "Finished shuffleRows" << endl;
  }
}

/** Shuffle each row of each block such that columns remain intact */
void
MAFAlignment::shuffleHorizontal() {
  if (verbose > 2) {
    REMARK << "Starting shuffleHorizontal" << endl;
  }
  for (size_type i = 0; i < size(); ++i) {
    ((*this)[i]).randomShuffleHoriz();
  }
  if (verbose > 2) {
    REMARK << "Finished shuffleHorizontal" << endl;
  }
}

/** Shuffle each alignment column of each block such that first sequence of each block remains unchanged */
void
MAFAlignment::shuffleVertical() {
  if (verbose > 2) {
    REMARK << "Starting shuffleVertical" << endl;
  }
  bool keepFirstFixed = true;
  for (size_type i = 0; i < size(); ++i) {
    ERROR_IF(getAssemblyRowId(i, getRefAssembly()) != 0,
	     "Vertical shuffling is only implemented for the case that the reference assignment is the first sequence of each alignment block.");
    ((*this)[i]).randomShuffleColumnsGapPreserving(keepFirstFixed);
  }
  if (verbose > 2) {
    REMARK << "Finished shuffleVertical" << endl;
  }
}

/** Returns the subsequence of the reference assembly between two assembly coordinates.
 *  Both start and end are zero-based and inclusive.
 *
 *  The two coordinates are converted to internal column ids; the characters of the
 *  reference-assembly row of all columns in between (including gap characters) are
 *  concatenated.
 *
 *  @param assemblyPosStart first assembly position (must not exceed assemblyPosEnd and
 *         must not be smaller than getRefAssemblyBegin()).
 *  @param assemblyPosEnd last assembly position (must be smaller than getRefAssemblyEnd()).
 *  @return the extracted characters; an empty string if one of the coordinates cannot be
 *          mapped to a valid column; a truncated string if a column is reached whose
 *          block does not contain the reference assembly.
 */
string
MAFAlignment::extractAssemblySequence(length_type assemblyPosStart, length_type assemblyPosEnd) const {
  ERROR_IF(assemblyPosStart > assemblyPosEnd,
	   "Internal error (L108): Start position has to be smaller or equal stop position");
  ERROR_IF(assemblyPosStart < getRefAssemblyBegin(),
	   "Internal error: start position is smaller than smallest defined assembly coordinate");
  ERROR_IF(assemblyPosEnd >= getRefAssemblyEnd(),
	   "Internal error: last position is greater than largest defined assembly coordinate");
  if (verbose > 2) {
    REMARK << "Started extractAssemblySequence with parameters " << assemblyPosStart << " " << assemblyPosEnd << endl;
  }
  length_type internalStart = convertAssemblyPositionToColumnId(assemblyPosStart);
  if (verbose > 2) {
    REMARK << "The internal start position is: " << internalStart << endl;
  }
  if ((internalStart < 0) || (internalStart >= getTotalLength())) {
    if (verbose > 1) {
      REMARK << "Warning: undefined start column: " << assemblyPosStart << " " << internalStart << endl;
    }
    return "";
  }
  length_type internalEnd = convertAssemblyPositionToColumnId(assemblyPosEnd);
  if (verbose > 2) {
    REMARK << "The internal end position is: " << internalEnd << endl;
  }
  if ((internalEnd < 0) || (internalEnd >= getTotalLength())) {
    if (verbose > 1) {
      // report the END coordinate here (the start coordinate was printed before by mistake)
      REMARK << "Warning: undefined end column: " << assemblyPosEnd << " " << internalEnd << endl;
    }
    return "";
  }
  string result;
  if (internalEnd >= internalStart) {
    // one character per column: allocate once instead of growing the string repeatedly
    result.reserve(static_cast<string::size_type>(internalEnd - internalStart + 1));
  }
  for (length_type id = internalStart; id <= internalEnd; ++id) {
    if (verbose > 2) {
      REMARK << "Obtaining sequence character from internal position " << id << endl;
    }
    string slice = getSlice(id); // find out correct sequence row
    length_type aliId = getAlignmentId(id);
    size_type rowId = getAssemblyRowId(aliId,refAssembly);
    if ((rowId < 0) || (rowId >= slice.size())) {
      // reference assembly not part of this block: cannot obtain full sequence
      break;
    }
    result += slice[rowId];
  }
  if (verbose > 2) {
    REMARK << "Finished extractAssemblySequence with parameters " << assemblyPosStart << " " << assemblyPosEnd << endl;
  }
  return result;
}

/** Returns the assembly of sequence seqId in alignment aliId.
 * Assumes that the sequence name starts with the assembly (format: assembly chromStart);
 * the first token of the name is returned.
 */
string
MAFAlignment::getSequenceAssembly(length_type aliId, size_type seqId) const {
  PRECOND(aliId < static_cast<length_type>(size()));
  const SequenceAlignment& block = (*this)[aliId];
  const string& seqName = block.getName(seqId);
  ASSERT(seqName.size() > 0);
  return getTokens(seqName)[0];
}

/** Returns the assemblies of all sequences of alignment aliId, in row order
 *  (one entry per sequence, see getSequenceAssembly). */
Vec<string>
MAFAlignment::getAlignmentAssemblies(length_type aliId) const {
  PRECOND(aliId < static_cast<length_type>(size()));
  const SequenceAlignment& block = (*this)[aliId];
  Vec<string> assemblies(block.size());
  size_type rowId = 0;
  while (rowId < assemblies.size()) {
    assemblies[rowId] = getSequenceAssembly(aliId, rowId);
    ++rowId;
  }
  return assemblies;
}

/** Builds a hash string describing the assemblies used in alignment block aliId:
 *  the assembly names are sorted and joined with underscores. */
string
MAFAlignment::getAlignmentAssembliesHash(length_type aliId) const {
  Vec<string> names = getAlignmentAssemblies(aliId);
  ERROR_IF(names.size() == 0,"Internal error : number of assemblies in each alignment should be greater zero.");
  sort(names.begin(), names.end());
  string hash = names[0];
  Vec<string>::size_type pos = 1;
  while (pos < names.size()) {
    hash.append("_");
    hash.append(names[pos]);
    ++pos;
  }
  return hash;
}

/** Returns the position of an internal column in coordinates of the given assembly:
 * the chromStart value of the assembly row in the block of that column plus the number
 * of non-gap characters of that row before the column.
 * Returns -1 if the block of the column does not contain the assembly.
 * Precondition: getAssemblyRowId and getChromStart must be already functional. */
MAFAlignment::length_type
MAFAlignment::getAssemblyPosition(length_type columnId, const string& assembly) const {
  PRECOND(columnId >= 0);
  PRECOND(columnId < getTotalLength());
  // translate the global column id into (block, column within block)
  length_type blockId = 0;
  length_type blockColumn = 0;
  findAlignmentIdColumnId(columnId, &blockId, &blockColumn);
  const SequenceAlignment& block = (*this)[blockId];
  const SequenceAlignment::size_type rowId = getAssemblyRowId(blockId, assembly);
  const bool rowFound = (rowId >= 0) && (rowId < block.size());
  if (!rowFound) {
    return -1; // undefined position
  }
  length_type position = getChromStart(blockId, assembly);
  ASSERT(position >= 0);
  const string& residues = block.getSequence(rowId);
  // every non-gap character in front of the column advances the assembly position by one
  for (length_type col = 0; col < blockColumn; ++col) {
    if (NucleotideTools::isGap(residues[col])) {
      continue;
    }
    ++position;
  }
  // diagnostic output of the back conversion (always computed with respect to the reference assembly)
  if (verbose > 4) {
    REMARK << "Back conversion result: " << refAssembly << " " << residues[blockColumn] << " "
	 << columnId << " " << position << " " << convertAssemblyPositionToColumnId(position)
	 << endl;
  }
  return position;
}

/** Generates the array that allows to convert from an absolute position of the reference
 * assembly to an internal column id.
 *
 * assemblyToColumnOffset is set to the reference chromStart of the first block; entry k of
 * assemblyToColumnMapping receives the smallest column id whose reference position equals
 * assemblyToColumnOffset + k. Entries without any column keep the value getTotalLength().
 *
 * Precondition: chromStarts must already be defined.
 * @return false if no reference assembly is set or if there are no alignment blocks
 *         (nothing is changed in that case), true otherwise. */
bool
MAFAlignment::updateAssemblyToColumnMapping() const {
  string methodName = "updateAssemblyToColumnMapping";
  if (verbose > 0) {
    REMARK << "Starting " << methodName << endl;
  }
  if (refAssembly.size() == 0) {
    REMARK << "Warning: no reference assembly specified. Could not generate mapping from reference assembly positions to internal column ids. " << endl;
    return false;
  }
  // was "size() < 0", which can never hold: an empty container reached the block accesses below
  if (size() == 0) {
    REMARK << "Warning: no alignments specified. Could not generate mapping from reference assembly positions to internal column ids. " << endl;
    return false;
  }
  assemblyToColumnOffset = getChromStart(0, refAssembly);
  length_type lastAssemblyPos = getChromStart(size() - 1, refAssembly) + (*this)[size() -1].getLength();
  length_type assemblyPosNums = lastAssemblyPos - assemblyToColumnOffset;
  ASSERT(assemblyPosNums > 0);
  length_type n = getTotalLength();
  // initialize all entries with n ("no column found yet"); reuse the storage if the size fits
  if (static_cast<length_type>(assemblyToColumnMapping.size()) != assemblyPosNums) {
    assemblyToColumnMapping = Vec<length_type>(assemblyPosNums, n);
  } else {
    for (length_type i = 0; i < assemblyPosNums; i++) {
      assemblyToColumnMapping[i] = n;
    }
  }
  for (length_type i = 0; i < n; i++) {
    length_type assemblyPos = getAssemblyPosition(i, refAssembly);
    // signed on purpose: an undefined position (-1) must trip the assertion instead of wrapping around
    length_type k = assemblyPos - assemblyToColumnOffset;
    ASSERT(k >= 0);
    ASSERT(k < static_cast<length_type>(assemblyToColumnMapping.size()));
    if (i < assemblyToColumnMapping[k]) {
      assemblyToColumnMapping[k] = i; // do not set higher possibilities; these correspond to gaps with same mapping result
    }
  }
  if (verbose > 3) {
    REMARK << "Assembly Offset:" << assemblyToColumnOffset << endl;
    for (size_type i = 0; i < assemblyToColumnMapping.size(); ++i) {
      REMARK << i << " " << assemblyToColumnMapping[i] << endl;
    }
  }
  if (verbose > 0) {
    REMARK << "Finished " << methodName << endl;
  }
  return true;
}

/** Converts an absolute position of the reference assembly to an internal column id using
 * the lookup table assemblyToColumnMapping. The table is generated on first use
 * (see updateAssemblyToColumnMapping).
 *
 * Precondition: chromStarts must already be defined.
 * @param assemblyId absolute position in coordinates of the reference assembly.
 * @return the column id stored for that position; getTotalLength() (an invalid column id)
 *         if no reference assembly is set, if there are no alignment blocks or if the
 *         lookup table could not be generated. */
MAFAlignment::length_type
MAFAlignment::convertAssemblyPositionToColumnIdFast(length_type assemblyId) const {
  string methodName = "convertAssemblyPositionToColumnIdFast";
  if (verbose > 2) {
    REMARK << "Starting " << methodName << endl;
  }
  if (refAssembly.size() == 0) {
    REMARK << "Warning: no reference assembly specified. Could not generate mapping from reference assembly positions to internal column ids. " << endl;
    // "return false" yielded 0, which is a valid column id; use the same out-of-range marker as the slow variants
    return getTotalLength();
  }
  if (size() == 0) { // was "size() < 0", which can never hold
    REMARK << "Warning: no alignments specified. Could not generate mapping from reference assembly positions to internal column ids. " << endl;
    return getTotalLength();
  }
  // lazy evaluation:
  if (assemblyToColumnMapping.size() == 0) {
    if (!updateAssemblyToColumnMapping()) {
      return getTotalLength(); // no lookup table available
    }
  }
  length_type k = assemblyId - assemblyToColumnOffset;
  ASSERT(k >= 0);
  ASSERT(k < static_cast<length_type>(assemblyToColumnMapping.size()));
  // the table is indexed relative to assemblyToColumnOffset, not by the absolute position
  return assemblyToColumnMapping[k];
}

/** Generates array that allows to convert from absolute assembly position to column id.
 * Precondition: chromStarts must already be defined.*/
bool
MAFAlignment::updateRefChromStarts() const {
  if (size() == 0) {
    if (refChromStarts.size() > 0) {
      refChromStarts.clear();
    }
    return false;
  }
  if (refChromStarts.size() == size()) {
    return false; // assume everything ok
  }
  if (verbose > 2) {
    REMARK << "Starting updateRefChromStarts..." << endl;
  }
  refChromStarts = Vec<length_type>(size()); 
  for (size_type i = 0; i < size(); ++i) {
    refChromStarts[i] = getChromStart(i, refAssembly);
  }
  if (verbose > 2) {
    REMARK << "Finished updateRefChromStarts." << endl;
  }
  return true;
}

/** Converts an absolute position of the reference assembly (external (UCSC) counting with zero-based starts and 1-based ends) to an internal column id (the how-many'th column is it in internal representation, zero-based).
 *
 * Strategy: the alignment block is located with a binary search (lower_bound) in the cached
 * reference chromStart values (refChromStarts); afterwards the reference row of that block
 * is scanned from its first column while counting non-gap characters.
 *
 * Return values as implemented:
 *  - getTotalLength() (after a failing ASSERT) if no reference assembly is set or if there are no blocks;
 *  - colStarts[block] if the position equals the reference chromStart of a block;
 *  - otherwise the first column of the located block whose running assembly position equals
 *    assemblyId (this can be a gap column in front of the residue, because the comparison is
 *    done before the position is advanced);
 *  - -1 (after a failing ASSERT) if the located block has no row for the reference assembly;
 *  - see the NOTE(review) at the end for the case that no column of the block matches.
 *
 * Precondition: chromStarts must already be defined.
 * NOTE(review): lower_bound presumably relies on refChromStarts being sorted ascendingly - confirm. */
MAFAlignment::length_type
MAFAlignment::convertAssemblyPositionToColumnIdSlow(length_type assemblyId) const {
  string methodName = "convertAssemblyPositionToColumnIdSlow";
  string assembly = refAssembly;
  if (verbose > 2) {
    REMARK << "Starting " << methodName << endl;
  }
  if (refAssembly.size() == 0) {
    REMARK << "Warning: no reference assembly specified. Could not generate mapping from reference assembly positions to internal column ids. " << endl;
    ASSERT(false);
    return getTotalLength();
  }
  if (size() <= 0) {
    REMARK << "Warning: no alignments specified. Could not generate mapping from reference assembly positions to internal column ids. " << endl;
    ASSERT(false);
    return getTotalLength();
  }
  // make sure the cache of reference chromStart values (one per block) exists
  updateRefChromStarts();
  // length_type n = getTotalLength();
  // length_type currPos = 0;
  length_type aliId = 0;
  // find alignment: first block whose reference chromStart is not smaller than assemblyId
  Vec<length_type>::iterator it = lower_bound(refChromStarts.begin(), refChromStarts.end(), assemblyId);
  if (it != refChromStarts.end()) {
    aliId = distance(refChromStarts.begin(), it);
    if (*it != assemblyId) {
      // the found block starts behind the position, so the position belongs to the previous
      // block (aliId becomes -1 if the position lies in front of the first block)
      --aliId;
    } else {
      // exact hit: the position is the first position of this block
      return colStarts[aliId];
    }
  } else {
    // all blocks start in front of the position
    aliId = static_cast<length_type>(size())-1; // must be in last alignment!? FIXIT add assertion
  }
//   for (aliId = aliMin; aliId < static_cast<length_type>(size()); aliId++) {
//     chromStart = getChromStart(aliId, assembly);
//     if (chromStart > assemblyId) {
//       ASSERT(aliId > 0);
//       --aliId ;
//       break;
//     } else if (chromStart == assemblyId) {
//       ASSERT(colStarts[aliId] == convertAssemblyPositionToColumnIdVerySlow(assemblyId));
//       return colStarts[aliId];
//     }
//     // colCount += (*this)[aliId].getLength(); // total number of column so far
//   }
//   if (aliId == static_cast<length_type>(size())) {
//     --aliId; // must be in last alignment!? FIXIT add assertion
//   }
  if (verbose > 2) {
    REMARK << "Found alignment id " << aliId << " from defined " << colStarts.size() << endl;
  }
  if (aliId < 0) {
    // position lies in front of the first block
    REMARK << "Could not find assembly id " << assemblyId << " in " << refChromStarts << endl;
    REMARK << "Column starts: " << colStarts << endl;
    ERROR("Severe internal error encountered in convertAssemblyPositionToColumnIdSlow");
  } 
  length_type colCount = colStarts[aliId]; // global column id of the first column of the block
  length_type chromStart = getChromStart(aliId, assembly);
  ASSERT(colCount == colStarts[aliId]);
  // ASSERT(aliId < static_cast<length_type>(size())); // must be found somewhere
  const SequenceAlignment& ali = (*this)[aliId];
  SequenceAlignment::sequence_size_type aliColId = ali.getLength(); // number of columns of the block
  SequenceAlignment::size_type seqId = getAssemblyRowId(aliId, assembly); // ali.findSequenceByProperty("assembly", assembly);
  if ((seqId < 0) || (seqId >= ali.size())) {
    ASSERT(false);
    return -1; // undefined position
  }
  ASSERT(chromStart == getChromStart(aliId, assembly));
  length_type assemblyPos = chromStart; // running assembly position while scanning the row
  length_type result = 0;
  ASSERT(assemblyPos >= 0);
  const string& seq = ali.getSequence(seqId); // count how many characters before that column
  ASSERT(aliColId == seq.size());
  for (length_type i = 0; i < static_cast<length_type>(aliColId); ++i) {
    // compare before advancing: column i carries the position reached by the non-gap characters in front of it
    if (assemblyId == assemblyPos) {
      result = colCount + i;
// #ifndef NDEBUG
//       if (verbose > 2) { // very slow tests
// 	if(result != convertAssemblyPositionToColumnIdVerySlow(assemblyId)){
// 	  length_type testAliId, testColId;
// 	  findAlignmentIdColumnId(colCount, &testAliId, &testColId);
// 	  cout << "Strange: " << result << " " << convertAssemblyPositionToColumnIdVerySlow(assemblyId) << " " << colCount << " " << assemblyId << " " 
// 	       << getChromStart(aliId, assembly) << " " << getChromStart(aliId+1, assembly) << " ali-id: " << aliId 
// 	       << " " << testAliId << " col-id:" << testColId << endl;
// 	}
// 	ASSERT(result == convertAssemblyPositionToColumnIdVerySlow(assemblyId));
//       }
// #endif
      return result;
    }
    if (!NucleotideTools::isGap(seq[i])) {
      ++assemblyPos;
    }
  }
  // No column of the located block carries the requested position.
  // NOTE(review): the value returned here is the running ASSEMBLY position, not a column id.
  // Other paths of this function use getTotalLength() as "undefined" marker - confirm what callers expect.
  result = assemblyPos;
  //  ASSERT(result == convertAssemblyPositionToColumnIdFast(assemblyId));
  //  ASSERT(false); // should never be here;
  if (verbose > 1) {
    REMARK << "Warning: problem with computing internal column id for assembly position " << assemblyId << endl;
  }
  return result;
}

/** Converts an absolute position of the reference assembly to an internal column id with a
 * linear scan over all columns: the first column whose assembly position
 * (see getAssemblyPosition) equals assemblyId is returned.
 *
 * Precondition: chromStarts must already be defined.
 * @param assemblyId absolute position in coordinates of the reference assembly.
 * @return the column id, or getTotalLength() (an invalid column id) if no column matches,
 *         if no reference assembly is set or if there are no alignment blocks. */
MAFAlignment::length_type
MAFAlignment::convertAssemblyPositionToColumnIdVerySlow(length_type assemblyId) const {
  string methodName = "convertAssemblyPositionToColumnIdVerySlow";
  if (verbose > 1) {
    REMARK << "Starting " << methodName << endl;
  }
  const length_type n = getTotalLength();
  if (refAssembly.size() == 0) {
    REMARK << "Warning: no reference assembly specified. Could not generate mapping from reference assembly positions to internal column ids. " << endl;
    // "return false" yielded 0, which is a valid column id; report "not found" like the scan below
    return n;
  }
  if (size() == 0) { // was "size() < 0", which can never hold
    REMARK << "Warning: no alignments specified. Could not generate mapping from reference assembly positions to internal column ids. " << endl;
    return n;
  }
  for (length_type i = 0; i < n; ++i) {
    if (assemblyId == getAssemblyPosition(i, refAssembly)) {
      return i;
    }
  }
  return n; // not found
}

/** Prepares for future unload operations (i.e. temporary storing of alignment blocks to disk
 *  in order to save memory): stores directory and file name prefix and resets the per-block
 *  status array to 0 for every block. Requires at least one block and non-empty arguments. */
void
MAFAlignment::initUnload(const string& dir, const string& prefix) {
  PRECOND((size() > 0) && (dir.size() > 0) && (prefix.size() > 0));
  // one status entry per alignment block, all starting with 0
  unloadIds = Vec<size_type>(size(), 0);
  unloadPrefix = prefix;
  unloadDir = dir;
  ASSERT(isUnloadable());
}

/** Returns the name of the temporary file used for alignment block aliId:
 *  unload directory, SLASH, unload prefix, "block_", the block id and the extension ".maf". */
string
MAFAlignment::unloadAlignmentName(size_type aliId) const {
  string fileName = unloadDir;
  fileName += SLASH;
  fileName += unloadPrefix;
  fileName += "block_";
  fileName += uitos(aliId);
  fileName += ".maf";
  return fileName;
}

/** Upload individual alignment block (read back from tmp disk space).
 *  Not implemented yet: the body only consists of a failing assertion. The commented-out
 *  code below is the unfinished draft of the implementation. */
void
MAFAlignment::uploadAlignment(size_type aliId) const {
  ASSERT(false); // FIXIT: code not ready yet
//   ifstream ifs(unloadAlignmentName(aliId).c_str());
//   ERROR_IF(!ifs, "Error reading from temporary file!");
//   SimpleSequenceAlignment dummyAlignment;
//   SequenceAlignment * ali = const_cast<SequenceAlignment*>( &((*this)[aliId]));
//   // (*this)[aliId] = dummyAlignment; // effectively delete this block from RAM! Loses properties objects!
//   *ali = dummyAlignment;
//   ali->readFasta(ifs);
//   ifs.close();
//  unloadIds[aliId] = ALIGNMENT_UPLOADED;
}

/** Unload individual alignment block (write to tmp disk space, delete in memory).
 *  Not implemented yet: the body only consists of a failing assertion. The commented-out
 *  code below is the unfinished draft of the implementation. */
void
MAFAlignment::unloadAlignment(size_type aliId) const {
  ASSERT(false); // code not yet ready FIXIT
//   ofstream ofs(unloadAlignmentName(aliId).c_str());
//   ERROR_IF(!ofs, "Error writing to temporary file!");
//   ((*this)[aliId]).writeFasta(ofs);
//   ofs.close();
//   SimpleSequenceAlignment dummyAlignment;
//   SequenceAlignment * ali = const_cast<SequenceAlignment*>( &((*this)[aliId]));
//   // (*this)[aliId] = dummyAlignment; // effectively delete this block from RAM! Loses properties objects!
//   *ali = dummyAlignment;
//   unloadIds[aliId] = ALIGNMENT_UNLOADED;
}

void
MAFAlignment::unload() const {
  for (size_type i = 0; i < size(); ++i) {
    const SequenceAlignment& ali = (*this)[i];
    if ((static_cast<length_type>(ali.getLength()) * static_cast<length_type>(ali.size())) > unloadCharLimit) {
      unloadAlignment(i);
    }
  }
}
  
/** Updates the reference assembly to be the first one from the first sequence of the first alignment block, unless it was previously specified. Returns true if changed. */
bool
MAFAlignment::updateRefAssembly() {
  PRECOND(size() > 0);
  if (refAssembly.size() == 0) {
    const SequenceAlignment& ali = (*this)[0];
    // const SequenceAlignment::properties_type properties = ali.getSequenceProperties(0);
    // SequenceAlignment::properties_type::const_iterator foundId = properties.find("assembly");
    // if (foundId != properties.end()) {
    refAssembly = getTokens(ali.getName(0))[0]; // foundId -> second;
    ASSERT(refAssembly.size() > 0);
    return true;
    //     } else {
    //       REMARK << "No reference assembly could be specified!" << endl;
    //       return false;
    //     }
  }
  return false;
}

/** Returns the row in which the given assembly is stored in the given alignment block.
 *  If the assembly is not part of that block, the number of sequences of the block is returned. */
MAFAlignment::size_type
MAFAlignment::getAssemblyRowId(length_type alignmentId, const string& assembly) const {
  PRECOND(alignmentId >= 0);
  PRECOND(alignmentId < static_cast<length_type>(assemblyRowIdMaps.size()));
  PRECOND(alignmentId < static_cast<length_type>(size()));
  const map<string, row_type>& rowIds = assemblyRowIdMaps[alignmentId];
  if (rowIds.empty()) {
    return (*this)[alignmentId].size(); // nothing stored for this block: not found
  }
  // shortcut: test the first entry of the map before doing a regular lookup
  map<string, row_type>::const_iterator hit = rowIds.begin();
  if (hit->first != assembly) {
    hit = rowIds.find(assembly);
    if (hit == rowIds.end()) {
      return (*this)[alignmentId].size(); // not found
    }
  }
  return hit->second; // stored row index
}

/** Returns chromStart value of MAF alignment. If assembly is not defined, return -1. If chromStart is not defined, return 0. */
// MAFAlignment::length_type
// MAFAlignment::getOriginalChromStart(length_type alignmentId, const string& assembly) const {
//   const SequenceAlignment& ali = (*this)[alignmentId];
//   SequenceAlignment::size_type seqId = getAssemblyRowId(alignmentId, assembly);
//   if ((seqId < 0) || (seqId >= ali.size())) {
//     return -1; // undefined position
//   }
//   string chromStartString = ali.getSequenceProperty(seqId, "chromStart");
//   length_type chromStart = 0;
//   if (chromStartString.size() > 0) {
//     chromStart = stoi(chromStartString); // if not defined, assume start at zero
//   }
//   return chromStart;
// }

/** Returns the stored chromStart value of the given assembly in the given alignment block.
 *  Returns -1 if the block does not contain a sequence of that assembly. */
MAFAlignment::length_type
MAFAlignment::getChromStart(length_type alignmentId, const string& assembly) const {
  const SequenceAlignment& block = (*this)[alignmentId];
  const SequenceAlignment::size_type rowId = getAssemblyRowId(alignmentId, assembly);
  const bool rowFound = (rowId >= 0) && (rowId < block.size());
  if (!rowFound) {
    return -1; // undefined position
  }
  return chromStarts[alignmentId][rowId];
}

/** Sets the chromStart value of the sequence of the given assembly in the given alignment block.
 *
 *  If the assembly is the reference assembly, the data derived from the reference chromStart
 *  values is kept consistent: the cached entry in refChromStarts is updated and the
 *  position-to-column lookup table is discarded (it is regenerated on next use by
 *  convertAssemblyPositionToColumnIdFast).
 *
 *  @param alignmentId index of the alignment block (must be valid).
 *  @param assembly assembly whose row is modified.
 *  @param chromStart new start coordinate.
 *  @return true if the value was set, false if the block does not contain the assembly. */
bool
MAFAlignment::setChromStart(length_type alignmentId, const string& assembly, length_type chromStart) {
  PRECOND((alignmentId >= 0) && (alignmentId < static_cast<length_type>(size())));
  SequenceAlignment& ali = (*this)[alignmentId];
  SequenceAlignment::size_type seqId = getAssemblyRowId(alignmentId, assembly); // get row in alignment for this assembly
  if ((seqId < 0) || (seqId >= ali.size())) {
    return false; // undefined position
  }
  chromStarts[alignmentId][seqId] = chromStart;
  if (assembly == refAssembly) {
    // updateRefChromStarts() only rebuilds the cache if its size does not fit, so a changed
    // value would otherwise never reach it
    if (alignmentId < static_cast<length_type>(refChromStarts.size())) {
      refChromStarts[alignmentId] = chromStart;
    }
    // the lookup table was computed from the old start value
    assemblyToColumnMapping.clear();
  }
  return true;
}

/** Returns the completely assembled sequence of one assembly with one character per internal
 *  column. Columns of blocks that do not contain the assembly are filled with GAP_CHAR. */
string
MAFAlignment::generateAssemblySequence(const string& assembly) const {
  const length_type columnCount = getTotalLength();
  string assembled(getTotalLength(), GAP_CHAR);
  for (length_type col = 0; col < columnCount; ++col) {
    ASSERT((col >= 0) && (col < getTotalLength()));
    const length_type blockId = getAlignmentId(col);
    const size_type rowId = getAssemblyRowId(blockId, assembly);
    if ((rowId < 0) || (rowId >= (*this)[blockId].size())) {
      continue; // assembly not part of this block: keep the gap character
    }
    assembled[col] = getSlice(col)[rowId];
  }
  ASSERT(static_cast<length_type>(assembled.size()) == getTotalLength());
  return assembled;
}

/** Returns the set of assemblies that occur in both of two alignment blocks of this object */
set<string>
MAFAlignment::getCommonAssemblies(length_type alignmentId1, length_type alignmentId2) const {
  const map<string, row_type>& rows1 = getAssemblyRowIdMap(alignmentId1);
  const map<string, row_type>& rows2 = getAssemblyRowIdMap(alignmentId2);
  set<string> shared;
  map<string,row_type>::const_iterator entry = rows1.begin();
  while (entry != rows1.end()) {
    // keep every assembly of the first block that is also known to the second block
    if (rows2.count(entry->first) > 0) {
      shared.insert(entry->first);
    }
    ++entry;
  }
  return shared;
}

/** Returns the set of assemblies that occur both in block alignmentId1 of this object
 *  and in block alignmentId2 of maf2 */
set<string>
MAFAlignment::getCommonAssemblies(length_type alignmentId1, const MAFAlignment& maf2, length_type alignmentId2) const {
  const map<string, row_type>& rows1 = getAssemblyRowIdMap(alignmentId1);
  const map<string, row_type>& rows2 = maf2.getAssemblyRowIdMap(alignmentId2);
  set<string> shared;
  map<string,row_type>::const_iterator entry = rows1.begin();
  while (entry != rows1.end()) {
    // keep every assembly of the first block that is also known to the second block
    if (rows2.count(entry->first) > 0) {
      shared.insert(entry->first);
    }
    ++entry;
  }
  return shared;
}

/** Returns a hash string for the assemblies that two alignment blocks of this object have in
 *  common: the sorted assembly names joined with underscores. Raises an error if the blocks
 *  have no assembly in common. */
string
MAFAlignment::getCommonAssembliesHash(length_type alignmentId1, length_type alignmentId2) const {
  // the set already iterates in sorted order
  const set<string> shared = getCommonAssemblies(alignmentId1, alignmentId2);
  ERROR_IF(shared.size() == 0, "Internal error: could not compute hash for common assemblies between alignment blocks.");
  set<string>::const_iterator name = shared.begin();
  string hash = *name;
  for (++name; name != shared.end(); ++name) {
    hash += "_" + *name;
  }
  return hash;
}

/** Returns a hash string for the assemblies that block alignmentId1 of this object and block
 *  alignmentId2 of maf2 have in common: the sorted assembly names joined with underscores.
 *  Raises an error if the blocks have no assembly in common. */
string
MAFAlignment::getCommonAssembliesHash(length_type alignmentId1, const MAFAlignment& maf2, length_type alignmentId2) const {
  // the set already iterates in sorted order
  const set<string> shared = getCommonAssemblies(alignmentId1, maf2, alignmentId2);
  ERROR_IF(shared.size() == 0, "Internal error: could not compute hash for common assemblies between alignment blocks.");
  set<string>::const_iterator name = shared.begin();
  string hash = *name;
  for (++name; name != shared.end(); ++name) {
    hash += "_" + *name;
  }
  return hash;
}

/** Returns the slice of the n'th column restricted to the specified set of assemblies:
 *  one character per assembly, in the iteration order of the set. Assemblies that are not
 *  part of the block of that column are represented by GAP_CHAR. */
MAFAlignment::sequence_type
MAFAlignment::getSlice(length_type columnId, const set<string>& assemblies) const {
  PRECOND(columnId >= 0);
  PRECOND(columnId < getTotalLength());
  const string column = getSlice(columnId);
  const length_type blockId = getAlignmentId(columnId);
  string picked(assemblies.size(), GAP_CHAR);
  ASSERT(picked.size() == assemblies.size());
  ASSERT(column.size() == (*this)[blockId].size());
  string::size_type out = 0;
  for (set<string>::const_iterator name = assemblies.begin(); name != assemblies.end(); ++name, ++out) {
    // row of this assembly in the block; an out-of-range row means "not present"
    const size_type rowId = getAssemblyRowId(blockId, *name);
    if (rowId < column.size()) {
      picked[out] = column[rowId];
    }
  }
  return picked;
}

/** Recomputes colStarts, the internal column id of the first column of every alignment block
 *  (running sum of the lengths of the preceding blocks).
 *  Precondition: totalLength must be equal to the sum of the lengths of all blocks. */
void
MAFAlignment::updateColumnIds() {
  // colStarts holds one entry per BLOCK. The former comparison with totalLength (number of
  // columns) could leave an array of wrong size in place whenever both numbers happened to match.
  if (colStarts.size() != size()) {
    colStarts = Vec<length_type>(size(), totalLength);
  }
  length_type pc = 0; // number of columns in front of the current block
  for (size_type i = 0; i < size(); ++i) {
    colStarts[i] = pc;
    pc += (*this)[i].getLength();
  }
  ASSERT(colStarts.size() == size());
  ASSERT(ContainerTools::isStrictlySorted(colStarts.begin(), colStarts.end()));
  ASSERT(pc == totalLength);
  ASSERT(validateColumnIds());
}

/** Updates alignmenIds and alignmentColumnIds attributes */
bool
MAFAlignment::validateColumnIds() const {
  string methodName = "validateColumnIds";
  if (verbose > 1) {
    REMARK << "Starting validateColumnIds..." << endl;
  }
  if (colStarts.size() != size()) {
    cout << "ERROR: " << colStarts.size() << " : " << size() << endl;
    ERROR("Internal error in MAFAlignment::validateColumnIds: colStarts.size() != size()");
  }
  ASSERT(colStarts.size() == size());
  if (!ContainerTools::isSorted(colStarts.begin(), colStarts.end())) {
    cout << "Warning: column indices are not sorted properly!" << endl;
    cout << colStarts << endl;
  }
  ASSERT(ContainerTools::isSorted(colStarts.begin(), colStarts.end()));
  length_type pc = 0; 
  length_type id1, id2;
  for (size_type i = 0; i < size(); ++i) {
    if (verbose > 2) {
      REMARK << methodName << " : working on block " << (i+1) << endl;
    }
    const SequenceAlignment& ali = (*this)[i];
    SequenceAlignment::size_type n = ali.getLength();
    for (SequenceAlignment::size_type j = 0; j < n; ++j) {
      // if (verbose > 2) {
      // cout << "Working on column " << (j+1) << endl;
      // }
      if (pc >= totalLength) {
	ERROR("Internal error in MAFAlignment::validateColumnIds: pc >= totalLength.");
      }
      ASSERT(pc < totalLength);
      // REMARK << "Working on alignment column " << (j+1) << " out of " << n << endl;
      // alignmentIds[pc] = static_cast<length_type>(i);
      // alignmentColumnIds[pc] = static_cast<length_type>(j);
      if (verbose > 6) {
	REMARK << "starting findAlignmentIdColumnId " << endl;
      }
      findAlignmentIdColumnId(pc, &id1, &id2);
      if (verbose > 6) {
	REMARK << "finished findAlignmentIdColumnId " << endl;
      }
      if ((id1 != static_cast<length_type>(i)) || (id2 != static_cast<length_type>(j))) {
	if (verbose > 0) {
	  REMARK << "Bad column index found: " << i << " " << j << " " << id1 << " " << id2 << " " << pc << endl;
	}
	return false;
      }
      pc++;
    }
  }
  ASSERT(pc == totalLength);
  if (verbose > 1) {
    REMARK << "Finished validateColumnIds..." << endl;
  }
  return true;
}

/** Rebuilds, for the MAF block with index aliId, the map from assembly name to row index.
 *  Precondition: the first word of every sequence name is the assembly name
 *  (expected name format: "assembly chromStart", separated by one space). */
void
MAFAlignment::updateAssemblyRowIdMap(size_type aliId) {
  PRECOND(aliId < size());
  PRECOND(aliId < assemblyRowIdMaps.size());
  const SequenceAlignment& block = (*this)[aliId];
  map<string, row_type>& rowIds = assemblyRowIdMaps[aliId];
  rowIds.clear();
  for (size_type row = 0; row < block.size(); ++row) {
    const string name = block.getName(row);
    ASSERT(!name.empty());
    const vector<string> words = getTokens(name);
    const string assembly = words[0]; // first word of the name
    ASSERT(!assembly.empty());
    rowIds[assembly] = row;
  }
  // one map entry per sequence: the assembly names within a block must all be different
  ASSERT(rowIds.size() == block.size());
}

/** Updates alignmenIds and alignmentColumnIds attributes */
void
MAFAlignment::updateAssemblyRowIdMaps() {
  if (assemblyRowIdMaps.size() != size()) {
    assemblyRowIdMaps = Vec<map<string, row_type> >(size());
  }
  for (size_type i = 0; i < size(); ++i) {
    updateAssemblyRowIdMap(i);
  }
  ASSERT(assemblyRowIdMaps.size() == size());
}

/** Creates chromStarts datastructure */
void
MAFAlignment::updateChromStarts() {
  if (verbose > 1) {
    REMARK << "Starting updateChromStarts... ";
  }
  chromStarts = Vec<Vec<length_type> >(size());
  for (size_type i = 0; i < chromStarts.size(); ++i) { // loop over MAF blocks
    const SequenceAlignment& ali = (*this)[i];
    chromStarts[i] = Vec<length_type>(ali.size());
    for (SequenceAlignment::size_type j = 0; j < ali.size(); ++j) { // loop over sequences
      string name = ali.getName(j);
      vector<string> words = getTokens(name);
      ERROR_IF(words.size() != 2, "Alignment parsing error: expected two words in sequence name: " + name);
      length_type chromStart = stoi(words[1]);
      chromStarts[i][j] = chromStart;
    }
  }
  POSTCOND(ContainerTools::isSorted(chromStarts.begin(), chromStarts.end()));
  if (verbose > 1) {
    cout << "done." << endl;
  }
}

bool
MAFAlignment::validate() const {
  return (totalLength > 0) && (size() > 0) 
    && (size() == assemblyRowIdMaps.size()) 
    && (size() == chromStarts.size())
    && (size() == colStarts.size())
    && (assemblies.size() > 0);
}

/** Reads alignment data from a stream. The format is chosen by the first non-whitespace character:
 *  '>' selects FASTA (readFASTA), everything else is treated as MAF (readMAF); a warning is
 *  printed if a presumed MAF file does not start with '#'.
 *
 *  @param is       input stream
 *  @param bed      regions passed on to readMAF (not used for FASTA input)
 *  @param blockMin passed on to readMAF (not used for FASTA input)
 *  @param blockMax passed on to readMAF (not used for FASTA input)
 */
void
MAFAlignment::read(istream& is, const BEDRegions& bed, size_type blockMin, size_type blockMax) {
  // Peek at the first non-whitespace character. The variable is initialised so that an empty or
  // unreadable stream does not lead to a read of an indeterminate value; in that case nothing was
  // extracted and nothing must be put back.
  char c = '\0';
  if (is >> c) {
    is.putback(c);
  }
  if (c == '>') {
    if (verbose > 0) {
      REMARK << "Reading sequences in FASTA format...";
    }
    readFASTA(is);
  } else {
    if (c != '#') {
      if (verbose > 0) {
        REMARK << "Warning: Expected # as first character in first line. Nonetheless, assuming MAF format file." << endl;
      }
    }
    if (getRefAssembly().size() == 0) {
      REMARK << "No reference assembly specified before reading MAF blocks!" << endl;
    }
    readMAF(is, bed, blockMin, blockMax);
  }
  if (verbose > 0) {
    REMARK << "Read " << size() << " alignment blocks with a total number of " << getTotalLength() << " columns." << endl;
  }
}

/** Renames every sequence of the block to "assembly chromStart" (taken from the sequence properties
 *  "assembly" and "chromStart"; a missing chromStart is replaced by "0" with a warning) and decides
 *  whether the block is to be used.
 *  For a row whose assembly equals refAssembly, the reference start column of the block is set to its
 *  chromStart. The block is accepted if such a row exists and, in case bed.validate() is true, the
 *  interval [chromStart, chromStart + numChars - 1] of that row overlaps the BED regions of its
 *  chromosome (bed.isOverlapping). If several rows match refAssembly, the last one decides.
 *  Returns true if the block is accepted, false otherwise (in particular if no reference row exists). */
bool
MAFAlignment::prepareAlignment(SequenceAlignment& ali, const BEDRegions& bed, const string& refAssembly) const {
  const bool bedValidates = bed.validate();
  bool refFound = false;
  bool accepted = false;
  for (SequenceAlignment::size_type row = 0; row < ali.size(); ++row) {
    string assembly = ali.getSequenceProperty(row, "assembly");
    ERROR_IF(assembly.size() == 0, "No assembly defined in alignment with sequence " + ali.getName(row));
    string chromStartString = ali.getSequenceProperty(row, "chromStart");
    if (chromStartString.size() == 0) {
      REMARK << "Warning: no ChromStart property defined in sequence with name " << ali.getName(row) << endl;
      chromStartString = "0";
    }
    ali.setName(assembly + " " + chromStartString, row);
    if (refAssembly != assembly) {
      continue; // only the reference row takes part in the filter decision
    }
    refFound = true;
    string chrom = ali.getSequenceProperty(row, "chrom");
    ERROR_IF(chrom.size() == 0, "No chrom property defined in alignment: " + assembly + " " + chrom);
    SequenceAlignment::sequence_size_type chromStart = stol(chromStartString);
    SequenceAlignment::sequence_size_type numChars = 0;
    SequenceAlignment::sequence_size_type numGaps = 0;
    ali.countCharacters(numChars, numGaps, row);
    // last index of this sequence on the chromosome
    SequenceAlignment::sequence_size_type chromEnd = chromStart + numChars - 1;
    ali.setReferenceSequenceStartCol(chromStart);
    // without usable BED data every block that contains the reference assembly passes
    accepted = bedValidates ? bed.isOverlapping(chrom, IntervallInt(chromStart, chromEnd)) : refFound;
    if (verbose > 4) {
      REMARK << "Result of overlap of : " << chromStart << " " << chromEnd << " : " << accepted << endl;
    }
  }
  // no reference sequence found: the alignment cannot be used
  return refFound && accepted;
}

/** Reads a complete MAF file from "is" and replaces the current content of this object with it.
 *
 *  Processing steps:
 *   - all lines are read into memory and split into blocks with readAlignment();
 *   - a block is stored only if its 1-based running number n fulfils n > blockMin (and n <= blockMax
 *     unless blockMax < 1), it has at least seqMin sequences and prepareAlignment() accepts it;
 *   - the stored blocks are sorted if necessary, blocks with identical start index are removed, columns
 *     that are gaps in the reference assembly are collapsed and overlapping blocks are pruned;
 *   - finally totalLength, colStarts and chromStarts are rebuilt.
 *
 *  If no reference assembly has been set, the assembly of the first sequence of the first parsed block
 *  becomes the reference assembly.
 *
 *  @param is       input stream with MAF formatted data
 *  @param bed      regions handed to prepareAlignment() for filtering blocks
 *  @param blockMin blocks with running number <= blockMin are skipped
 *  @param blockMax blocks with running number > blockMax are skipped; values < 1 disable the upper limit
 */
void
MAFAlignment::readMAF(istream& is, const BEDRegions& bed, size_type blockMin, size_type blockMax) {
  // ASSERT(getRefAssembly().size() > 0);
  string methodName = "readMaf";
  if (verbose > 1) {
    REMARK << "Reading genomic alignment in MAF format with strand mode " << strandMode << endl;
    if (requiredAssemblies.size() > 0) {
      REMARK << "Required assemblies: ";
      for (set<string>::const_iterator it = requiredAssemblies.begin(); it != requiredAssemblies.end(); it++) {
        cout << *it << " ";
      }
      cout << endl;
    }
    if (tabooAssemblies.size() > 0) {
      REMARK << "Ignored assemblies: ";
      for (set<string>::const_iterator it = tabooAssemblies.begin(); it != tabooAssemblies.end(); it++) {
        cout << *it << " ";
      }
      cout << endl;
    }
  }
  Vec<string> lines = getLines(is);
  if (verbose > 0) {
    REMARK << " Read " << lines.size() << " lines" << endl;
  }
  pc = 0; // reset line counter
  clear(); // remove all current alignments
  assemblies.clear();
  totalLength = 0; // total number of columns, class member attribute
  string refAss = getRefAssembly(); // bed.getAssembly();
  size_type blockCount = 0; // counts read alignment blocks
  while (pc < lines.size()) {
    // lines longer than 3 characters that do not start a block ("a ") are skipped right away
    if ((lines[pc].size() > 3) && (!((lines[pc][0] == 'a') && (lines[pc][1] == ' ')))) {
      ++pc;
      continue; // no point in invoking subroutine
    }
    alignment_imp_type ali = readAlignment(lines, pruneAfter, requiredAssemblies, tabooAssemblies);
    ERROR_IF(ali.size() == 0, "Strange: The first MAF block seems to have no sequence data!");
    if (refAss.size() == 0) {
      // get reference assembly from first found sequence: // code duplication compared to readAlignment method
      refAss = ali.getSequenceProperty(0, "assembly");
      refAssemblyChrom = ali.getSequenceProperty(0, "chrom");
      refAssemblyTotLength = stol(ali.getSequenceProperty(0, "totLength"));
      setRefAssembly(refAss);
      REMARK << "Since no reference assembly was specified, using assembly of first found sequence as reference assembly: "
             << refAss << " chromosome: " << refAssemblyChrom << " " << refAssemblyTotLength << endl;
    }
    if (shuffleMode) {
      ali.randomShuffle();
    }
    ++blockCount;
    if (! ((blockCount > blockMin) && ((blockMax<1) || (blockCount <= blockMax)))) {
      continue; // outside of block range
    }
    if (ali.size() >= seqMin) {
      ali.upperCaseSequences();
      if (isPlusStrand(ali, refAss) == STRAND_UNKNOWN) {
        REMARK << "No strand direction specified for alignment: " << endl;
        cout << ali << endl;
        for (SequenceAlignment::size_type i = 0; i < ali.size(); ++i) {
          cout << "Sequence " << (i+1) << endl;
          SequenceAlignment::properties_type prop = ali.getSequenceProperties(i);
          for (SequenceAlignment::properties_type::const_iterator it = prop.begin(); it != prop.end(); it++) {
            cout << it->first << " : " << it->second << endl;
          }
        }
      }
      // bring the block into the requested strand orientation of the reference assembly
      if (strandMode == STRAND_PLUS) {
        if (isPlusStrand(ali, refAss) == STRAND_MINUS) {
          ali.transformReverseComplement();
          if (verbose > 1) {
            REMARK << "Using reverse complement of MAF block (internal representation) " << (size() + 1) << " : "
                   << isPlusStrand(ali, refAss) << endl;
          }
          if (verbose > 2) {
            REMARK << "Resulting alignment:" << endl;
            cout << ali << endl;
            ali.writeProperties(cout);
          }
        } else if (verbose > 2) {
          REMARK << "Not using reverse complement of MAF block (internal representation) " << (size() + 1) << " : "
                 << isPlusStrand(ali, refAss) << endl;
        }
      } else if (strandMode == STRAND_MINUS) {
        if (isPlusStrand(ali, refAss) == STRAND_PLUS) {
          ali.transformReverseComplement();
          if (verbose > 1) {
            REMARK << "Using reverse complement of MAF block (internal representation) " << (size() + 1) << " : "
                   << isPlusStrand(ali, refAss) << endl;
          }
          if (verbose > 2) {
            REMARK << "Resulting alignment:" << endl;
            cout << ali << endl;
            ali.writeProperties(cout);
          }
        } else {
          if (verbose > 2) {
            REMARK << "Not using reverse complement of MAF block (internal representation) " << (size() + 1) << " : "
                   << isPlusStrand(ali, refAss) << endl;
          }
        }
      } else if (strandMode == STRAND_UNKNOWN) {
        // do nothing
      } else {
        ERROR("Unknown strand mode!");
      }
      bool aliFilterCheck = prepareAlignment(ali, bed, refAssembly);
      bool aliFilterCheck2 = (ali.findSequenceByProperty("assembly", refAss) < ali.size()); // true if reference sequence part of ali
      if (aliFilterCheck && aliFilterCheck2) {
        for (SequenceAlignment::size_type jj = 0; jj < ali.size(); ++jj) {
          string assemb = ali.getSequenceProperty(jj, "assembly");
          assemblies.insert(assemb);
        }
        if (removePropertiesMode) {
          ali.removeAllProperties();
        }
        push_back(ali); // stores alignment block
        totalLength += ali.getLength();
        if (verbose > 2) {
          REMARK << verbose << " Alignment block " << size() << " with " << ali.size()
                 << " sequences added. Line counter: " << pc << endl;
        }
      } else if (verbose > 2) {
        if (!aliFilterCheck) {
          REMARK << "Ignoring alignment starting at " << ali.getName(0) << " because it is filtered by BED data." << endl;
        }
        if (!aliFilterCheck2) {
          REMARK << "Ignoring alignment starting at " << ali.getName(0) << " because it does not contain reference assembly: "
                 << refAss << endl;
        }
      }
    } else {
      // Block with too few sequences. readAlignment() stops on the header line of the following block
      // without consuming it, so the current line must only be skipped if it is not such a header;
      // otherwise the complete following block would be lost.
      const bool atBlockHeader = (pc < lines.size()) && (lines[pc].size() > 2)
        && (lines[pc][0] == 'a') && (lines[pc][1] == ' ');
      if (!atBlockHeader) {
        ++pc;
      }
    }
  }
  ERROR_IF(size() == 0,
           "MAF alignment could not be read. It is either due to a wrong format or a too high value of the minimum number of sequences per alignment block (option -s).");
  if (verbose > 1) {
    REMARK << "Checking if MAF blocks are properly sorted..." << endl;
  }
  if (!ContainerTools::isSorted(begin(), end())) {
    REMARK << "MAF blocks are not sorted in order of start residue indices of reference sequence. Starting to sort ... " << endl;
    if (verbose > 1) {
      for (size_type i = 0; i < size(); ++i) {
        cout << (*this)[i].getReferenceSequenceStartCol() << " ";
      }
      cout << endl;
    }
    sort(begin(), end());
    ASSERT(ContainerTools::isSorted(begin(), end()));
  }
  if (!ContainerTools::isStrictlySorted(begin(), end())) {
    size_type removedBlocks = removeDuplicateStartIdBlocks();
    REMARK << "Removed " << removedBlocks << " MAF blocks because of identical start indices." << endl;
  }
  ERROR_IF(!ContainerTools::isStrictlySorted(begin(), end()),
           "There are MAF blocks that start with the same nucleotide index for the reference sequence.");
  if (verbose > 1) {
    REMARK << "Starting updateAssemblyRowIdMaps..." << endl;
  }
  updateAssemblyRowIdMaps();
  if (verbose > 1) {
    REMARK << "Removing gaps of reference assembly ..." << endl;
  }
  collapseAssembly(refAss); // remove all gaps with respect to reference asssembly
  // there should be no assemblies with 0 length:
  for (size_type i = 0; i < size(); ++i) {
    ERROR_IF((*this)[i].getLength() == 0,
             "Internal error in reading readMAF: MAF Block without reference sequence found.");
  }
  if (verbose > 1) {
    REMARK << "Pruning overlapping MAF blocks..." << endl;
  }
  size_type overlapCount = pruneOverlapping();
  if (verbose > 1) {
    REMARK << "Pruned " << overlapCount << " overlapping MAF blocks." << endl;
  }
  // there should be no assemblies with 0 length:
  for (size_type i = 0; i < size(); ++i) {
    ERROR_IF((*this)[i].getLength() == 0,
             "Internal error in reading readMAF after pruning overlapping MAF blocks: MAF Block without reference sequence found.");
  }
  updateTotalLength();
  if (verbose > 1) {
    REMARK << "Generating internal alignment indices..." << endl;
    REMARK << "Starting updateColumnIds..." << endl;
  }
  updateColumnIds();
  if (verbose > 1) {
    REMARK << "Starting updateChromStarts..." << endl;
  }
  updateChromStarts();
  //  ASSERT(validate());
  if (verbose > 0) {
    REMARK << "Finished " << methodName << endl;
  }
}

/** Reads a MAF formatted stream and writes out, again in MAF format, the blocks that pass the filter.
 *  A block passes if it has at least seqMin sequences and prepareAlignment() accepts it (it contains
 *  the reference assembly and is not filtered out by the BED data). Depending on strandMode a block
 *  is reverse-complemented before it is written. The stored blocks of this object are not modified.
 *  Precondition: a reference assembly must have been set.
 *
 *  @param is  input stream with MAF formatted data (read completely into memory)
 *  @param os  output stream that receives the MAF header line and all accepted blocks
 *  @param bed regions handed to prepareAlignment() for filtering blocks
 *  @return number of written blocks
 */
MAFAlignment::size_type
MAFAlignment::filterMAF(istream& is, ostream& os, const BEDRegions& bed) const {
  ASSERT(getRefAssembly().size() > 0);
  size_type writeCount = 0;
  string methodName = "filterMAF"; // used in log messages only
  os << "##maf version=1" << endl; // metadata string in output
  if (verbose > 0) {
    REMARK << "Starting " << methodName << " with strand mode " << strandMode << endl;
  }

  string refAss = getRefAssembly();
  Vec<string> lines = getLines(is); // read all at same time! might be problematic! FIXIT
  if (verbose > 0) {
    REMARK << " Read " << lines.size() << " lines" << endl;
  }
  pc = 0; // reset line counter
  while (pc < lines.size()) {
    // lines longer than 3 characters that do not start a block ("a ") are skipped right away
    if ((lines[pc].size() > 3) && (!((lines[pc][0] == 'a') && (lines[pc][1] == ' ')))) {
      ++pc;
      continue; // no point in invoking subroutine
    }
    alignment_imp_type ali = readAlignment(lines, pruneAfter, requiredAssemblies, tabooAssemblies);
    if (shuffleMode) {
      ali.randomShuffle();
    }
    if (ali.size() >= seqMin) {
      ali.upperCaseSequences();
      if (isPlusStrand(ali, refAss) == STRAND_UNKNOWN) {
        REMARK << "No strand direction specified for alignment: " << endl;
        cout << ali << endl;
        for (SequenceAlignment::size_type i = 0; i < ali.size(); ++i) {
          cout << "Sequence " << (i+1) << endl;
          SequenceAlignment::properties_type prop = ali.getSequenceProperties(i);
          for (SequenceAlignment::properties_type::const_iterator it = prop.begin(); it != prop.end(); it++) {
            cout << it->first << " : " << it->second << endl;
          }
        }
      }
      // bring the block into the requested strand orientation of the reference assembly
      if (strandMode == STRAND_PLUS) {
        if (isPlusStrand(ali, refAss) == STRAND_MINUS) {
          ali.transformReverseComplement();
          if (verbose > 0) {
            REMARK << "Using reverse complement of MAF block (internal representation) " << (size() + 1) << " : "
                   << isPlusStrand(ali, refAss) << endl;
          }
          if (verbose > 2) {
            REMARK << "Resulting alignment:" << endl;
            cout << ali << endl;
            ali.writeProperties(cout);
          }
        } else if (verbose > 2) {
          REMARK << "Not using reverse complement of MAF block (internal representation) " << (size() + 1) << " : "
                 << isPlusStrand(ali, refAss) << endl;
        }
      } else if (strandMode == STRAND_MINUS) {
        if (isPlusStrand(ali, refAss) == STRAND_PLUS) {
          ali.transformReverseComplement();
          if (verbose > 0) {
            REMARK << "Using reverse complement of MAF block (internal representation) " << (size() + 1) << " : "
                   << isPlusStrand(ali, refAss) << endl;
          }
          if (verbose > 2) {
            REMARK << "Resulting alignment:" << endl;
            cout << ali << endl;
            ali.writeProperties(cout);
          }
        } else {
          if (verbose > 2) {
            REMARK << "Not using reverse complement of MAF block (internal representation) " << (size() + 1) << " : "
                   << isPlusStrand(ali, refAss) << endl;
          }
        }
      } else if (strandMode == STRAND_UNKNOWN) {
        // do nothing
      } else {
        ERROR("Unknown strand mode!");
      }
      bool aliFilterCheck = prepareAlignment(ali, bed, refAssembly);
      bool aliFilterCheck2 = (ali.findSequenceByProperty("assembly", refAss) < ali.size()); // true if reference sequence part of ali
      if (aliFilterCheck && aliFilterCheck2) {
        ++writeCount;
        writeMAFBlock(os, ali);
        os << endl;
        if (removePropertiesMode) {
          ali.removeAllProperties(); // NOTE(review): ali is a local copy that was already written - presumably without effect
        }
        if (verbose > 2) {
          REMARK << verbose << " Alignment block " << size() << " with " << ali.size()
                 << " sequences passed filter. Line counter: " << pc << endl;
        }
      } else if (verbose > 2) {
        if (!aliFilterCheck) {
          REMARK << "Ignoring alignment starting at " << ali.getName(0) << " because it is filtered by BED data." << endl;
        }
        if (!aliFilterCheck2) {
          REMARK << "Ignoring alignment starting at " << ali.getName(0) << " because it does not contain reference assembly: "
                 << refAss << endl;
        }
      }
    } else {
      // Block with too few sequences (or no block at all in the current line). readAlignment() stops on
      // the header line of the following block without consuming it, so the current line must only be
      // skipped if it is not such a header; otherwise the complete following block would be lost.
      const bool atBlockHeader = (pc < lines.size()) && (lines[pc].size() > 2)
        && (lines[pc][0] == 'a') && (lines[pc][1] == ' ');
      if (!atBlockHeader) {
        ++pc;
      }
    }
  }
  if (verbose > 0) {
    REMARK << "Finished " << methodName << endl;
  }
  return writeCount;
}

/** Removes MAF blocks until no two neighbouring blocks share the same reference start column.
 *  From each pair of neighbouring blocks with identical start column the block with the lower score is
 *  removed (the first block of the pair if the scores are equal). The scan is repeated until a pass
 *  does not find any such pair.
 *  Precondition: the blocks must already be sorted.
 *  Returns the number of removed MAF blocks. */
MAFAlignment::size_type
MAFAlignment::removeDuplicateStartIdBlocks() {
  ASSERT(ContainerTools::isSorted(begin(), end()));
  size_type removedTotal = 0;
  bool foundDuplicates = true;
  while (foundDuplicates) { // at least one pass is always performed
    Vec<size_type> victims; // indices of the blocks to be removed in this pass
    for (size_type next = 1; next < size(); ++next) {
      const size_type prev = next - 1;
      const SequenceAlignment& first = (*this)[prev];
      const SequenceAlignment& second = (*this)[next];
      sequence_size_type start1 = first.getReferenceSequenceStartCol(); // 0-based start of first block
      sequence_size_type start2 = second.getReferenceSequenceStartCol(); // 0-based start of second block
      if (start1 != start2) {
        continue;
      }
      victims.push_back((first.getScore() <= second.getScore()) ? prev : next);
    }
    foundDuplicates = !victims.empty();
    // erase from the back so that the remaining indices stay valid
    sort(victims.begin(), victims.end());
    reverse(victims.begin(), victims.end());
    for (size_type k = 0; k < victims.size(); ++k) {
      const bool alreadyErased = (k > 0) && !(victims[k] < victims[k-1]); // same index listed twice
      if (!alreadyErased) {
        erase(begin() + victims[k]);
        ++removedTotal;
      }
    }
  }
  ASSERT(ContainerTools::isStrictlySorted(begin(), end())); // no more duplicates
  return removedTotal;
}

/** Counts the pairs of neighbouring MAF blocks that overlap, i.e. where the first block ends
 *  (start column + length) behind the start column of the second block. */
MAFAlignment::size_type
MAFAlignment::countOverlapping() const {
  size_type overlaps = 0;
  for (size_type left = 0; left + 1 < size(); ++left) {
    const SequenceAlignment& first = (*this)[left];
    const SequenceAlignment& second = (*this)[left + 1];
    sequence_size_type start1 = first.getReferenceSequenceStartCol(); // 0-based start of first block
    sequence_size_type start2 = second.getReferenceSequenceStartCol(); // 0-based start of second block
    sequence_size_type end1 = start1 + first.getLength(); // 1-based end of first block
    // end1 == start2 means the blocks are adjacent, which is not an overlap
    if (!(end1 > start2)) {
      continue;
    }
    ++overlaps;
  }
  return overlaps; // number of overlapping neighbour pairs
}

/** Prunes overlapping MAF blocks.
 *  For each pair of neighbouring blocks where the first block ends behind the reference start column of
 *  the second block, the first block is shortened by the size of the overlap (prune(0, newLength)).
 *  Alignment scores are currently not taken into account: it is always the first block of a pair that
 *  is shortened. Shortening the second block instead was not implemented on purpose, because the
 *  chromStart indices of that block would have to be updated as well.
 *  An error is raised if a block would have to be shortened to zero length.
 *  Returns the number of pruned blocks. */
MAFAlignment::size_type
MAFAlignment::pruneOverlapping() {
  size_type count = 0;
  for (size_type i = 1; i < size(); ++i) {
    SequenceAlignment& ali1 = (*this)[i-1];
    const SequenceAlignment& ali2 = (*this)[i];
    sequence_size_type firstIndex1 = ali1.getReferenceSequenceStartCol(); // 0-based start of first block
    sequence_size_type firstIndex2 = ali2.getReferenceSequenceStartCol(); // 0-based start of second block
    sequence_size_type lastIndex1 = firstIndex1 + ali1.getLength(); // 1-based end of first block
    if (lastIndex1 <= firstIndex2) { // lastIndex1 == firstIndex2 is still ok, no overlap
      continue;
    }
    sequence_size_type diff = lastIndex1 - firstIndex2; // has to be shortened this much
    ERROR_IF(ali1.getLength() < diff, "Internal error in readMAF: cannot prune overlapping alignment");
    sequence_size_type newLength = ali1.getLength() - diff;
    ASSERT(newLength >= 0);
    ERROR_IF(newLength == 0, "Error in readMAF: cannot prune overlapping alignment because it would lead to an empty MAF block.");
    if (verbose > 0) {
      REMARK << "Pruning MAF block (first)" << (i) << " to new length: " << newLength << endl;
    }
    ali1.prune(0, newLength);
    ++count;
  }
  ASSERT(countOverlapping() == 0);
  return count; // returns number of pruned alignments
}

/** Appends all MAF blocks of "other" to the end of this object.
 *  Both objects must use the same reference assembly. The chromStart information of the appended
 *  blocks is copied unchanged; the set of assemblies becomes the union of both sets; column start
 *  indices and assembly row maps are recomputed. */
void
MAFAlignment::append(const MAFAlignment& other) {
  PRECOND(other.getRefAssembly() == getRefAssembly());
  totalLength += other.getTotalLength();
  // central operation: copy all alignment blocks of the other object
  insert(end(), other.begin(), other.end());
  // chromStart info is appended as it is, without modification
  chromStarts.insert(chromStarts.end(), other.chromStarts.begin(), other.chromStarts.end());
  // union of the assembly names; already known names are ignored by the set
  assemblies.insert(other.assemblies.begin(), other.assemblies.end());
  // remaining bookkeeping has to be recomputed
  updateColumnIds();
  updateAssemblyRowIdMaps();
}

/** Returns 1 is plus strand of reference assembly found, -1 if negative strand, 0 if not found or strand character other than "+" or "-" */
int
MAFAlignment::isPlusStrand(const SequenceAlignment& ali, const string& assembly) {
  ASSERT(assembly.size() > 0);
  int result = 0;
  string sPlus = "+";
  string sMinus = "-";
  SequenceAlignment::size_type id = ali.findSequenceByProperty("assembly", assembly);
  if (id < ali.size()) {
    string strandProp = ali.getSequenceProperty(id, "strand");
    if(strandProp == sPlus) { // if same string, strcmp returns zero
      ASSERT((strandProp.size() == 1) && (strandProp[0] == '+'));
      result = 1;
    } else if (strandProp == sMinus) {
      ASSERT((strandProp.size() == 1) && (strandProp[0] == '-'));
      result = -1;
    }
  } else {
    cout << "Could not find sequence with assembly name " << assembly << " in alignment!" << endl;
  }
  return result;
}

/** Reads one MAF alignment block if the current line (index given by the member line counter pc) starts with "a ".
 *  Internal use only. FIXIT: "const" is not desirable: the method advances the member line counter pc and may set
 *  refAssemblyChrom and refAssemblyTotLength.
 *
 *  On return pc refers to the first line that was not consumed: a line with 5 or fewer characters, the header of
 *  the next block, a sequence line with fewer than 7 words, or the end of "lines".
 *  If the current line does not start a block, pc is not changed and no sequence is added to the result.
 *
 *  @param lines              all lines of the MAF file
 *  @param pruneAfter         if >= 1: as soon as this many sequences are stored, a sequence whose residue string was
 *                            already seen in this block is not stored again; values < 1 disable this
 *  @param requiredAssemblies if not empty: sequences whose assembly is not contained in this set are skipped
 *  @param tabooAssemblies    sequences whose assembly is contained in this set are skipped
 *  @return the parsed alignment block with score and per-sequence properties
 *          ("assembly", "chrom", "chromStart", "length", "strand", "totLength")
 */
SimpleSequenceAlignment
MAFAlignment::readAlignment(const Vec<string>& lines, size_type pruneAfter,
			    const set<string>& requiredAssemblies, const set<string>& tabooAssemblies) const {
  ASSERT((pc >= 0) && (pc < lines.size()));
  // ASSERT(getRefAssembly().size() > 0);
  SimpleSequenceAlignment result;
  string refAssembly = getRefAssembly(); 
  set<string> sequenceSet; // used for checking for identical sequences
  if ((lines[pc].size() > 2) && (lines[pc][0] == 'a') && (lines[pc][1] == ' ')) {
    if (verbose > 2) {
      REMARK << "Detected new alignment block in line " << pc << ":" << lines[pc] << endl;
    }
    // header line: expected to consist of exactly two words of the form "a score=VALUE"
    vector<string> words = getTokens(lines[pc], " ", false); // returns vector of words FIXIT: handle repeat spaces    
    ASSERT(words.size() == 2);
    vector<string> words2 = getTokens(words[1], "=", false);
    ERROR_IF(words2.size() != 2, "Error reading MAF block: Expected line of form: a score=VALUE");
    double score = stod(words2[1]);
    result.setScore(score);
    ++pc;
    // replace line with collapsed white space:
    // line = gsub('[[:space:]]+', ' ', lines[pc])
    //		line = lines[pc]
    // consume the lines of this block; a line with 5 or fewer characters ends the block
    while ((pc < lines.size()) && (lines[pc].size() > 5)) {
      if (verbose > 3) {
	REMARK << "Working on processed line" << (pc+1) << ":" << lines[pc] << endl;
      }
      char firstChar = lines[pc][0];
      if (firstChar == 'a') {
	// NOTE(review): if no sequence has been stored yet (e.g. all sequences so far were skipped by the assembly
	// filters), this line is consumed by the ++pc at the end of the loop and the sequences of the following block
	// are added to this block - confirm that this is intended.
	if (result.size() > 0) {
	  break; // new alignment; stop
	}
      }
      else if (firstChar == 's') {
	if (verbose > 3) {
	  REMARK << "Recognized sequence entry!" << endl;
	}
	// read "s" lines containing sequence info:
	// here is an example line taken from UCSC FAQ:
	// s hg16.chr7    27707221 13 + 158545518 gcagctgaaaaca
	// the words of such a line are used as follows:
	// words[1]: "assembly.chrom", words[2]: chromStart, words[3]: length, words[4]: strand,
	// words[5]: totLength, words[6]: sequence
	words = getTokens(lines[pc], " ", false);
	  // FIXIT : remove empty words! words = words[words != ""] // remove empty words
	if (words.size() < 7) {
	  REMARK << "Strange sequence descriptor in line" << pc << ":" << lines[pc] << endl;
	  break;
	}	
	string name = words[1];
	vector<string> descWords = getTokens(words[1], ".", false); // split with "." character such as hg18.chrom1 into hg18 and chrom1
	SequenceAlignment::properties_type entry; // new properties map (map<string, string>)
	if (descWords.size() >= 2) {
	  entry["assembly"] = descWords[0];
	  //	  assemblies.insert(descWords[0]);
	  entry["chrom"] = descWords[1];
          if ((descWords[0] == refAssembly) && (refAssembly.size() > 0)) {
	    if (refAssemblyChrom.size() == 0) {
	      refAssemblyChrom = descWords[1]; // set chromosome of reference assembly as first assembly found
	      refAssemblyTotLength = stol(words[5]);
	      ERROR_IF( ! (refAssemblyTotLength > 0),
			"Total length of reference assembly chromosome could not be read.");
	      
	    } else if (refAssemblyChrom != descWords[1]) {
	      ERROR("The MAF blocks of the reference assembly must all correspond to the same chromosome!");
	    }
	  }
	}
	// assembly filters: sequences of taboo assemblies and, if required assemblies are given, of all other
	// assemblies are skipped
	if ((tabooAssemblies.size() > 0) && (descWords.size() > 0) && (tabooAssemblies.find(descWords[0]) != tabooAssemblies.end())) {
	  if (verbose > 1) {
	    REMARK << "Ignoring sequence due to taboo assembly: " << descWords[0] << " : line: " << lines[pc] << endl;
	  }
	  ++pc;	  
	  continue; // ignoring this assembly
	}
	if ((requiredAssemblies.size() > 0) && (descWords.size() > 0) && (requiredAssemblies.find(descWords[0]) == requiredAssemblies.end())) {
	  if (verbose > 1) {
	    REMARK << "Ignoring sequence due to required assembly: " << lines[pc] << endl;
	  }
	  ++pc;	  
	  continue; // ignoring this assembly
	}
	entry["chromStart"] = words[2]; // keep it zero-based; do not add one
	entry["length"] = words[3];
	entry["strand"] = words[4]; // FIXIT
	entry["totLength"] = words[5]; // problem
	string seq = words[6];
	if (verbose > 3) {
	  REMARK << "Adding sequence " << name << " : " << seq << endl;
	}
	// the set grows only if this residue string has not been seen before in this block
        set<string>::size_type setSize = sequenceSet.size();
        sequenceSet.insert(seq);
	// store the sequence unless pruning is active, enough sequences are stored and the sequence is a duplicate
	if ((pruneAfter < 1) || (result.size() < pruneAfter) || (sequenceSet.size() > setSize)) {
	  result.addSequence(seq, name, entry); // add sequence, name, properties map
	} else {
	  if (verbose > 1) {
	    REMARK << "Not storing identical sequence " << name << " of length " << seq.size() << endl;
	  }
	}
	  
      } else if (firstChar == 'i') {
	// FIXIT: "i" lines are currently ignored
      } else if (firstChar == 'q') {
	// FIXIT: "q" lines are currently ignored
      } else if (firstChar == 'e') {
	// FIXIT: "e" lines are currently ignored
      }
      ++pc;
      //			if (pc <= length(lines)) {
      //				line = gsub('[[:space:]]+', ' ', lines[pc])
      //				line = lines[pc]
      //			}
    } // while
    // attr(result,"lastLine") = pc;
  } else if (verbose > 2) {
    REMARK << "Ignoring strange line:" << pc << lines[pc] << endl;
  }
  if (verbose > 1) {
    REMARK << "Read MAF block with " << result.size() << " ( " << sequenceSet.size() << " ) sequences and length "
	   << result.getLength() << endl;
  }
  return result;
}

/** Writes all stored blocks in MAF format: the header line "##maf version=1" followed by each
 *  block (see writeMAFBlock) and an empty line after each block.
 *  FIXIT: prettier formatting.
 */
void
MAFAlignment::writeMAF(ostream& os) const {
  os << "##maf version=1" << endl;
  size_type blockId = 0;
  while (blockId < size()) {
    writeMAFBlock(os, (*this)[blockId]);
    os << endl; // blocks are separated by an empty line
    ++blockId;
  }
}

/** Writes one alignment as MAF block: an "a score=..." line followed by one "s" line per sequence.
 *  A sequence is written only if its properties "assembly", "chromStart", "length" and "totLength"
 *  are not empty and "strand" consists of exactly one character; otherwise a warning is sent to cerr
 *  and the sequence is left out. The property "chrom" is optional ("assembly.chrom" versus "assembly").
 *  FIXIT: prettier formatting.
 */
void
MAFAlignment::writeMAFBlock(ostream& os, const SequenceAlignment& ali) {
  os << "a score=" << ali.getScore() << endl;
  for (SequenceAlignment::size_type row = 0; row < ali.size(); ++row) {
    const string assembly = ali.getSequenceProperty(row, "assembly");
    const string chromStart = ali.getSequenceProperty(row, "chromStart");
    const string chrom = ali.getSequenceProperty(row, "chrom");
    const string length = ali.getSequenceProperty(row, "length");
    const string strand = ali.getSequenceProperty(row, "strand");
    const string totLength = ali.getSequenceProperty(row, "totLength");
    const bool complete = (assembly.size() > 0) && (chromStart.size() > 0) && (length.size() > 0)
      && (strand.size() == 1) && (totLength.size() > 0);
    if (!complete) {
      cerr << "# Warning: cannot write sequence " << (row + 1) << " in alignment as MAF block because of missing properties." << endl;
      continue;
    }
    // source name: "assembly.chrom", or just "assembly" if no chromosome is known
    string assemblyChrom = assembly;
    if (chrom.size() > 0) {
      assemblyChrom += "." + chrom;
    }
    os << "s " << assemblyChrom << " " << chromStart << " " << length << " " << strand << " " << totLength << " "
       << ali.getSequence(row) << endl;
  }
}

/** Reads a single alignment in FASTA format and stores it as the only alignment block.
 *  Existing content is removed first. All sequences are converted to upper case and the
 *  sequence names are registered as assembly names (assumes all sequence names are different).
 *  Afterwards the derived bookkeeping (column ids, assembly row id maps, chromStarts and,
 *  if still unset, the reference assembly) is updated and the result is validated.
 *  @param is input stream providing the FASTA formatted alignment
 */
void
MAFAlignment::readFASTA(istream& is) {
  const int verbose = 0; // local debugging switch; raise to obtain progress messages
  if (verbose > 0) {
    REMARK << "Reading lines..." << endl;
  }
  clear(); // start from an empty container
  SimpleSequenceAlignment fastaAli;
  fastaAli.readFasta(is);
  fastaAli.upperCaseSequences(); // all sequences are stored in upper case
  BEDRegions noRegions; // empty region set, only needed to satisfy the interface
  // per original note: adds chromStart and assembly to name
  prepareAlignment(fastaAli, noRegions, refAssembly);
  if (removePropertiesMode) {
    fastaAli.removeAllProperties();  // FIXIT necessary?
  }
  push_back(fastaAli); // the FASTA content forms exactly one alignment block
  totalLength = fastaAli.getLength();
  const SequenceAlignment::size_type numSeqs = fastaAli.size();
  for (SequenceAlignment::size_type row = 0; row < numSeqs; ++row) {
    // sequence names double as assembly names; assumes all sequence names are different
    assemblies.insert(fastaAli.getName(row));
  }
  // upperCaseSequences();
  updateColumnIds();
  updateAssemblyRowIdMaps();
  updateChromStarts();
  if (getRefAssembly().size() == 0) {
    updateRefAssembly(); // no reference assembly defined so far
  }
  // updateAssemblyToColumnMapping();

  ASSERT(validate());
}
