#include <HashCorrelationFinder.h>
#include <NucleotideTools.h>
#include <cctype>
#include <ContainerTools.h>
#include <iomanip>
#include <iterator>
#include <queue>
#include <RankedSolution3.h>
#include <RankedSolution5.h>
#include <CorrelationTools.h>
#include <MAFAlignmentTools.h>

/**
 * Checks whether a pair i,j co-occurs with i-1,j+1 or i+1,j-1 (stacking pattern of RNA helix interactions).
 * A partner j of the middle set is kept if j+1 is contained in leftSet or j-1 is contained in rightSet.
 *
 * @param leftSet   partner positions used for the (i-1, j+1) test
 * @param middleSet partner positions of the column being filtered
 * @param rightSet  partner positions used for the (i+1, j-1) test
 * @return the elements of middleSet that have at least one such stacked neighbor
 */
set<HashCorrelationFinder::length_type>
HashCorrelationFinder::filterNeighborCorrelations(const set<length_type>& leftSet, const set<length_type>& middleSet, const set<length_type>& rightSet) {
  set<length_type> supported;
  set<length_type>::const_iterator candidate = middleSet.begin();
  while (candidate != middleSet.end()) {
    const bool stackedOnLeft = (leftSet.count((*candidate) + 1) > 0);
    if (stackedOnLeft || (rightSet.count((*candidate) - 1) > 0)) {
      supported.insert(*candidate);
    }
    ++candidate;
  }
  return supported;
}

/**
 * Returns true if column j is stored as a correlated partner of column i.
 * An index i outside of the result table, or a column without a result bin, yields false.
 * The bin is scanned linearly with find, so it does not have to be sorted
 * (a binary search via lower_bound would require sorted bins).
 */
bool
HashCorrelationFinder::isCorrelationFound(length_type i, length_type j) const {
  if (!(i >= 0) || !(i < static_cast<length_type>(resultBins->size()))) {
    return false; // column index is outside of the result table
  }
  if ((*resultBins)[i] == 0) {
    return false; // no bin allocated for this column
  }
  return find((*resultBins)[i]->begin(), (*resultBins)[i]->end(), j) != (*resultBins)[i]->end();
}

/**
 * Returns true if the correlation i,j is isolated, i.e. it is NOT part of a run of at least
 * three consecutive stacked correlations (..., (i-1,j+1), (i,j), (i+1,j-1), ...).
 * Returns false if i,j is the middle or one of the two ends of such a run.
 * Precondition: the correlation i,j itself has been found.
 */
bool
HashCorrelationFinder::isCorrelationIsolated3(length_type i, length_type j) const {
  PRECOND(isCorrelationFound(i, j));
  const bool outerNeighbor = isCorrelationFound(i-1,j+1);
  const bool innerNeighbor = isCorrelationFound(i+1,j-1);
  if (outerNeighbor && innerNeighbor) {
    return false; // i,j is the middle pair of a run of three
  }
  if (outerNeighbor && isCorrelationFound(i-2,j+2)) {
    return false; // i,j is the innermost pair of a run of three
  }
  if (innerNeighbor && isCorrelationFound(i+2,j-2)) {
    return false; // i,j is the outermost pair of a run of three
  }
  return true;
}

/**
 * Filters the resultBins data structure for one column: only correlations that are not isolated
 * (see isCorrelationIsolated3) are kept. Works only if columns i-1 and i+1 are defined already.
 * If neither neighboring column has a bin, no correlation of column i can be stacked, so the bin
 * of column i is removed altogether.
 *
 * @param i column index; must be greater than 0 and i+1 must be smaller than the total MAF length
 */
void
HashCorrelationFinder::filterIsolatedCorrelation3(length_type i) const {
  PRECOND((i > 0) && (maf != 0) && (resultBins != 0) && ((i + 1) < maf->getTotalLength()) );
  if ((*resultBins)[i] == 0) {
    return; // nothing to do
  }
  // Replacement bin; stays null if both neighboring columns are empty.
  result_vector_type * kept = 0;
  const bool hasNeighborBin = ((*resultBins)[i+1] != 0) || ((*resultBins)[i-1] != 0);
  if (hasNeighborBin) {
    kept = new result_vector_type();
    result_vector_type::const_iterator partner = (*resultBins)[i]->begin();
    while (partner != (*resultBins)[i]->end()) {
      const length_type partnerCol = *partner;
      ASSERT(isCorrelationFound(i, partnerCol));
      if (!isCorrelationIsolated3(i, partnerCol)) {
        kept->push_back(partnerCol);
      }
      ++partner;
    }
  }
  // The old bin is released only after the replacement has been built completely.
  delete( (*resultBins)[i] );
  (*resultBins)[i] = kept;
}

/**
 * Collects the correlations stored in resultBins and returns them.
 * All bins are sorted first, so that repeated partner entries of a column become adjacent
 * and can be skipped while collecting. If removeIsolated is set, correlations that are
 * isolated according to isCorrelationIsolated3 are left out.
 * Precondition: resultBins exists and has one entry per MAF column.
 */
HashCorrelationFinder::result_container
HashCorrelationFinder::getResults() const {
  PRECOND((resultBins != 0) && (static_cast<length_type>(resultBins->size()) == maf->getTotalLength()));
  // Pass 1: sort every bin (the linear find used by isCorrelationIsolated3 does not depend on this).
  for (size_type col = 0; col < resultBins->size(); ++col) {
    if ((*resultBins)[col] == 0) {
      continue;
    }
    sort((*resultBins)[col]->begin(), (*resultBins)[col]->end());
  }
  // Pass 2: emit each accepted correlation once per column.
  Vec<Correlation> collected;
  for (size_type col = 0; col < resultBins->size(); ++col) {
    if ((*resultBins)[col] == 0) {
      continue;
    }
    for (result_vector_type::const_iterator partner = (*resultBins)[col]->begin(); partner != (*resultBins)[col]->end(); ++partner) {
      if (removeIsolated && isCorrelationIsolated3(col, *partner)) {
        continue; // isolated correlations are not reported
      }
      const bool repeatsPrevious = (partner != (*resultBins)[col]->begin()) && (*(partner-1) == *partner);
      if (!repeatsPrevious) {
        collected.push_back(Correlation(col, *partner)); // at most 2 copies of each correlation are expected
      }
    }
  }
  return collected;
}

/**
 * Returns column ids of MAF columns that are compatible with the given column-assembly search.
 * Candidates delivered by the hash tables are not guaranteed to be complementary (for example
 * in the presence of gaps); each candidate is therefore re-checked with
 * MAFAlignmentTools::isColumnPairComplementary (using allowGu and allowGap) before it is returned.
 *
 * Outline:
 *  1. For each pair of rows with two different non-gap nucleotides a hash key is created and looked up
 *     in the search tables; found keys are ranked by the size of their position set.
 *  2. If more than one set was found, the smallest set is intersected with a partner set chosen by
 *     tables->findNextSetByName. With COVARNA_CONCURRENT_VECTOR defined, this first intersection is
 *     read from / written to intersectionCache; otherwise it is computed directly without caching.
 *  3. If only one set was found, it is used as is, unless it has more than searchColumnMax elements.
 *  4. The candidate list is intersected with all remaining sets, stopping early once it is empty.
 *  5. If at most searchColumnMax candidates remain, each is tested for complementarity with column colid.
 *
 * @param column        characters of the column to search for, one per row (DNA only; 'U' is asserted against)
 * @param colAssemblies assembly name belonging to each row of column; must have the same size as column
 * @param colid         id of the MAF column this search belongs to (printed 1-based in messages)
 * @return ids of the columns that passed the complementarity test; empty if no hash set was found
 *         or if too many candidate columns would have to be searched
 */
set<HashCorrelationFinder::length_type> 
HashCorrelationFinder::searchMafColumn(const string& column, const Vec<string>& colAssemblies,
                                       length_type colid) const {
  PRECOND(column.size() == colAssemblies.size());
  PRECOND(column.size() > 1);
  // One ranked entry (set size, hash key) per informative row pair.
  Vec<RankedSolution3<string> > queue;
  queue.reserve((column.size() * (column.size() - 1)) / 2);
  for (string::size_type i = 0; i < column.size(); ++i) {
    // toupper must not be called with a negative char value, hence the detour via unsigned char
    char c1 = static_cast<char>(toupper(static_cast<unsigned char>(column[i])));
    if (NucleotideTools::isGap(c1)) {
      continue;
    }
    ASSERT(c1 != 'U'); // only DNA allowed now
    for (string::size_type j = (i+1); j < column.size(); ++j) {
      char c2 = static_cast<char>(toupper(static_cast<unsigned char>(column[j])));
      if (NucleotideTools::isGap(c2)) {
        continue;
      }
      ASSERT(c2 != 'U'); // only DNA allowed now
      if (c1 == c2) { // ignore searches for conserved nucleotides; there are too many!
        continue;
      }
      ASSERT(colAssemblies[i].size() > 0);
      ASSERT(colAssemblies[j].size() > 0);
      string hashhash = MAFSearchTables::createHashTableHash(colAssemblies[i], colAssemblies[j], c1, c2);
      map<string, compressed_type >::const_iterator hashIt = tables->findPositionHash(hashhash);
      if (hashIt != tables->positionHashes.end()) {
        map<string, set<length_type> >::size_type sz = hashIt->second.size(); // FIXIT: this size is distorted due to compression
        queue.push_back(RankedSolution3<string>(static_cast<double>(sz), hashhash));
      }
    }
  }
  sort(queue.begin(), queue.end()); // "best" position set to use first is the smallest one
  Vec<string> queueNames(queue.size());
  for (size_type i = 0; i < queue.size(); i++) {
    queueNames[i] = queue[i].second;
  }
  Vec<length_type> result, tmpResult;
  set<length_type> finalResult;
  if (queue.size() < 1) {
    if (verbose > 1) {
      cout << "Ignored column: " << (colid + 1) << " " << column << " Strange, no hash sets found." << endl;
    }
    // ++ignoredCount; // THREADISSUE ?
    return finalResult;
  }
  ASSERT(queue.size() > 0);
  string bestName = queue[0].second; // choose smallest index set first
  const compressed_type& bestSet = tables->getSet(bestName);
  result.reserve(bestSet.size());
  string secondBestName;
  if (queue.size() > 1) {
    size_type secondBestMax = 4; // make decision about which set to choose next among this many alternatives (minus one)
    size_type searchSize = queueNames.size();
    if (searchSize > secondBestMax) {
      searchSize = secondBestMax;
    }
    secondBestName = tables->findNextSetByName(bestName, queueNames.begin()+1, queueNames.begin() + searchSize); // find best matching partner!
    // used strategy: two different assemblies as short as possible
    ASSERT(secondBestName.size() > 0);
    const compressed_type& secondBestSet = tables->getSet(secondBestName); 
    ASSERT(bestSet.size() <= secondBestSet.size()); // whole point is to sort by set size. 
    ASSERT(result.size() == 0);
#ifdef COVARNA_CONCURRENT_VECTOR
    // The cache key is only needed when the intersection cache is used.
    string doubleHash = MAFSearchTables::createDoubleHash(bestName, secondBestName);
    intersection_cache_type::const_accessor accRead; // read access
    if (! intersectionCache.find(accRead, doubleHash)) {
      accRead.release();
      set_intersection(bestSet.begin(), bestSet.end(), secondBestSet.begin(), secondBestSet.end(), back_inserter(result));
      if (result.size() <= CACHE_ELEMENT_SIZE_MAX) {
        // we need write access now, even though it is possible that the same set has just been written before by another thread:
        intersection_cache_type::accessor accWrite;
        intersectionCache.insert(accWrite, doubleHash);
        accWrite->second = result;
        accWrite.release();
        if (verbose > 4) {
          cout << "Storing " << doubleHash << " : " << result.size() << endl;
        }
      }
    } else { // container found!
      result = accRead -> second;
      accRead.release();
      if (verbose > 4) {
        cout << "Retrieved " << doubleHash << " : " << result.size() << endl;
      }
    }
#else
    // Non-concurrent build: compute the intersection directly without consulting the cache.
    // The previous code in this branch referred to an accessor object that was never declared
    // and never stored the computed intersection, so it could neither compile nor cache anything.
    set_intersection(bestSet.begin(), bestSet.end(), secondBestSet.begin(), secondBestSet.end(), back_inserter(result));
#endif
    // tables->setIntersectionSize(bestName, secondBestName, result.size()); // FIXIT test lowerBound ?
  } else {
    if (bestSet.size() <= searchColumnMax) {
      for (compressed_type::iterator it = bestSet.begin(); it != bestSet.end(); it++ ) {   
        result.push_back(*it);
      }
    } else {
      if (verbose > 0) {
        cout << "Ignored column: " << (colid + 1) << " " << column << " Too many possible columns to be searched in only one set ( " << bestSet.size() << " )" << endl; 
      }
      // ++ignoredCount;
      return finalResult;
    }
  }
  // Narrow the candidates down by intersecting with every remaining position set.
  if (result.size() > 0) {
    tmpResult.reserve(result.size());
    for (Vec<RankedSolution3<string> >::size_type i = 1; i < queue.size(); ++i) {
      if (queue[i].second == secondBestName) {
        continue; // do not use this set again
      }
      tmpResult.clear();
      const compressed_type& nextBestSet = tables->getSet(queue[i].second);
      set_intersection(result.begin(), result.end(), nextBestSet.begin(), nextBestSet.end(), back_inserter(tmpResult));
      result = tmpResult;
      if (result.size() == 0) {
        break; // nothing left that could match
      }
    }
  }
  if (result.size() <= searchColumnMax) {
    for (Vec<length_type>::const_iterator it = result.begin(); it != result.end(); it++) {
      ASSERT(colid >= 0 && colid < maf->getTotalLength());
      ASSERT(*it >= 0 && *it < maf->getTotalLength());
      if (MAFAlignmentTools::isColumnPairComplementary(*maf, colid, *it, allowGu, allowGap)) {
        finalResult.insert(*it); // ok to insert Correlation(highIndex, lowIndex) as well as Correlation(lowIndex, highIndex)
      } else {
        if (verbose > 2) {
          cout << "Warning: a non-complementary column pair was initially found but later thrown out: "
               << maf->getSlice(colid) << " " << (colid + 1) << endl;
          if (tables->getAssemblyPairFraction() == 1.0) {
            // this can still legally happen for example in the case of present gaps 
            // like chrY alignment pair columns: 263180 274356
            // -AAAATAAAAAAAN ATTTTATTTTTTTT anoCar1 bosTau3 canFam2 danRer4 equCab1 galGal3 gasAcu1 hg18 mm8 monDom4 ornAna1 panTro2 rheMac2 rn4
            MAFAlignmentTools::writeCommonColumnPair(cout, *maf, colid, *it);
          }
        }
      }
    }
  } else {
    if (verbose > 0) {
      cout << "Ignored column: " << (colid + 1) << " " << column << " Too many possible columns to be searched ( " << result.size() << " )" << endl; 
    }
    // ++ignoredCount;
  }
  return finalResult;
}

/** Returns natural logarithm of  probability of a particular stem to be found at a particular position. Multiply with number of possible positions
 * to obtain E-value (either (N*(N-1))/2 for one MAF alignment (N == totalLength), or N*M for two MAF alignments */ 
double
HashCorrelationFinder::computeLogPValue(const Stem& stem) const {
  ASSERT(false);
  ERROR_IF(false, "Sorry, computing of p-values not supported for this finder (HashCorrelationFinder)" );
  double result = 0.0;
  // find number of columns for two positions:
  length_type i = stem.getStart();
  length_type j = stem.getStop();
  ASSERT(i != j);
  length_type aliId1 = maf -> getAlignmentId(i);
  length_type aliId2 = maf -> getAlignmentId(j);
  /** Returns set of assemblies, that are in common between two alignments */
  set<string> commonAssemblies = maf->getCommonAssemblies(aliId1,aliId2);
  string slice1 = maf-> getSlice(i, commonAssemblies);
  string slice2 = maf-> getSlice(j, commonAssemblies);
  Vec<string> assemblyVec(commonAssemblies.size());
  set<string>::const_iterator it2 = commonAssemblies.begin();
  for (Vec<string>::iterator it = assemblyVec.begin(); it != assemblyVec.end(); it++, it2++) {
    *it = *it2;
  }
  size_type n1 = searchMafColumn(NucleotideTools::dnaComplement(slice1), assemblyVec, i).size();
  size_type n2 = searchMafColumn(NucleotideTools::dnaComplement(slice2), assemblyVec, j).size();
  if (n1 == 0) {
    cout << "# Strange error in slice-1 of p-value computation: " << stem << " " << slice1 << " " << slice2 << " " << assemblyVec << " : " << n1 << endl;
    return 0.0;
  }
  if (n2 == 0) {
    cout << "# Strange error in slice-2 of p-value computation: " << stem << " " << slice1 << " " << slice2 << " " << assemblyVec << " : " << n2 << endl;
    return 0.0;
  }
  ASSERT(n1 > 0); // if nothing complementary, how can it be part of a stem?
  ASSERT(n2 > 0);
  double logp1 = log(static_cast<double>(n1)/static_cast<double>(maf->getTotalLength()));
  double logp2 = log(static_cast<double>(n2)/static_cast<double>(maf->getTotalLength()));
  result = 0.5 * (logp1 + logp2); // corresonds to *geometric* mean of probabilities
  if (stem.getLength() > 1) {
    result += computeLogPValue(Stem(stem.getStart() + 1, stem.getStop()-1, stem.getLength()-1)); // recursive
  }
  return result;
}

/** Returns natural logarithm of  probability of a particular stem to be found at a particular position. Multiply with number of possible positions
 * to obtain E-value (either (N*(N-1))/2 for one MAF alignment (N == totalLength), or N*M for two MAF alignments */ 
double
HashCorrelationFinder::computeForwardLogPValue(const Stem& stem) const {
  ERROR_IF(false, "Sorry, computing of p-values not supported for this finder (HashCorrelationFinder)" );
  double result = 0.0;
  // find number of columns for two positions:
  length_type i = stem.getStart();
  length_type j = stem.getStop();
  ASSERT(i != j);
  length_type aliId1 = maf -> getAlignmentId(i);
  length_type aliId2 = maf -> getAlignmentId(j);
  /** Returns set of assemblies, that are in common between two alignments */
  set<string> commonAssemblies = maf->getCommonAssemblies(aliId1,aliId2);
  string slice1 = maf-> getSlice(i, commonAssemblies);
  string slice2 = maf-> getSlice(j, commonAssemblies);
  Vec<string> assemblyVec(commonAssemblies.size());
  set<string>::const_iterator it2 = commonAssemblies.begin();
  for (Vec<string>::iterator it = assemblyVec.begin(); it != assemblyVec.end(); it++, it2++) {
    *it = *it2;
  }
  size_type n1 = searchMafColumn(NucleotideTools::dnaComplement(slice1), assemblyVec, i).size();
  size_type n2 = searchMafColumn(NucleotideTools::dnaComplement(slice2), assemblyVec, j).size();
  if (n1 == 0) {
    cout << "# Strange error in slice-1 of p-value computation: " << stem << " " << slice1 << " " << slice2 << " " << assemblyVec << " : " << n1 << endl;
    return 0.0;
  }
  if (n2 == 0) {
    cout << "# Strange error in slice-2 of p-value computation: " << stem << " " << slice1 << " " << slice2 << " " << assemblyVec << " : " << n2 << endl;
    return 0.0;
  }
  ASSERT(n1 > 0); // if nothing complementary, how can it be part of a stem?
  ASSERT(n2 > 0);
  double logp1 = log(static_cast<double>(n1)/static_cast<double>(maf->getTotalLength()));
  double logp2 = log(static_cast<double>(n2)/static_cast<double>(maf->getTotalLength()));
  result = 0.5 * (logp1 + logp2); // corresonds to *geometric* mean of probabilities
  if (stem.getLength() > 1) {
    result += computeLogPValue(Stem(stem.getStart() + 1, stem.getStop() + 1, stem.getLength()-1)); // recursive
  }
  return result;
}
