// --*- C++ -*------x---------------------------------------------------------
// $Id: KnnNode.cc,v 1.1.1.1 2006/07/03 14:43:21 bindewae Exp $
//
// Class:           KnnNode
// 
// Base class:      -
//
// Derived classes: - 
//
// Author:          Eckart Bindewald
//
// Project name:    -
//
// Date:            $Date: 2006/07/03 14:43:21 $
//
// Description:     - 
// -----------------x-------------------x-------------------x-----------------

#include <KnnNode.h>
#include <clusterAlgorithms.h>
#include <StringTools.h>
#include <Random.h>
#include <vectornumerics.h>
#include <math.h> // for exp function

// ---------------------------------------------------------------------------
//                                   KnnNode
// -----------------x-------------------x-------------------x-----------------

/* CONSTRUCTORS */

/* default constructor: cluster cutoffs 0.1 / 0.01, gaussDev 0 (the plain,
   unweighted voting branch of predictClassProb), k = 10, two classes,
   verbosity 1, noSelfMode and statistics collection switched off */
KnnNode::KnnNode()
  : clusterCutoff(0.1),
    clusterCutoff2(0.01),
    gaussDev(0.0),
    kk(10),
    numClasses(2),
    verboseLevel(1),
    noSelfMode(false),
    statisticsRunning(false)
{
}

/* copy constructor: members are default-constructed first and then
   overwritten member by member through copy() */
KnnNode::KnnNode(const KnnNode& other)
{
  this->copy(other);
}

/* destructor: nothing to release explicitly in this body */
KnnNode::~KnnNode()
{
}


/* OPERATORS */

/** Assignment operator. Self-assignment is a no-op; otherwise all members
    are taken over from orig via copy(). Returns *this. */
KnnNode& 
KnnNode::operator = (const KnnNode& orig)
{
  if (this == &orig) {
    return *this;
  }
  copy(orig);
  return *this;
}

/** Stream output of a KnnNode: placeholder only. Reports the missing
    implementation through the ERROR macro and hands the stream back
    unchanged; rval is not inspected. */
ostream& 
operator << (ostream& os, const KnnNode& rval)
{
  (void) rval; // deliberately unused until the operator is implemented
  ERROR("Ouput operator not yet implemented!");
  return os;
}

/** Stream input of a KnnNode: placeholder only. Reports the missing
    implementation through the ERROR macro and hands the stream back
    unchanged; rval is not modified. */
istream& 
operator >> (istream& is, KnnNode& rval)
{
  (void) rval; // deliberately unused until the operator is implemented
  ERROR("Input operator not yet implemented!");
  return is;
}

/* PREDICATES */

/**
 * Predicts class membership scores for query vector v from its nearest
 * neighbors in the stored training data.
 *
 * The kk nearest neighbors are looked up with kNearestNeighbors. When
 * noSelfMode is set, one extra neighbor is requested and the closest hit is
 * skipped, because it might be the query vector itself.
 *
 * - gaussDev <= 0: every counted neighbor adds 1/(number of counted
 *   neighbors) to the entry of its class; entries are capped at 1.0.
 * - gaussDev  > 0: every counted neighbor is additionally weighted with
 *   exp(-d2 / gaussDev^2), d2 being the scaled squared distance to v, and
 *   the result is passed through probabilityNormalize.
 *
 * The result (one entry per class, numClasses entries) is also stored in
 * lastPrediction.
 *
 * Raises an error (ERROR_IF) if the node is not valid, if v does not have
 * the dimension of the training vectors, if no neighbor is left after the
 * optional self-hit has been skipped, or if a neighbor carries a class id
 * that is not smaller than numClasses.
 */
Vec<double> 
KnnNode::predictClassProb(const Vec<double>& v) const 
{
  ERROR_IF(!isValid(), "KnnNode not sufficiently defined for prediction!");
  ERROR_IF(v.size() != data[0].size(),
           "Incompatible size of knn input vector!");
  unsigned int kkUse = kk;   // number of neighbors to request
  unsigned int nStart = 0;   // index of the first neighbor that is counted
  if (noSelfMode) {
    ++kkUse;  // ask for one more neighbor ...
    ++nStart; // ... and ignore the closest one, it might be a hit against itself
  }
  Vec<unsigned int> nearest = kNearestNeighbors(v, data,
                                                clustData, kkUse, scaling,
                                                clusters, 
                                                subClusters, clusterCutoff, clusterCutoff2);
  if (verboseLevel > 4) {
    cout << "KnnNode:: Nearest neighbors of " << v << endl;
    for (unsigned int i = 0; i < nearest.size(); ++i) {
      cout << i << " " << nearest[i] << "  " 
           << dataClasses[nearest[i]] << "  " << data[nearest[i]] << endl;
    }
    cout << "KnnNode: nStart, kkUse: " << nStart << " " << kkUse 
         << " " << verboseLevel << endl;
  }
  // At least one neighbor must remain after skipping the potential self hit;
  // otherwise the weight below would be a division by zero.
  ERROR_IF(nearest.size() <= nStart, "No knn nearest neighbors found!");
  const double addTerm = 1.0 / static_cast<double>(nearest.size() - nStart);
  Vec<double> result(numClasses, 0.0);
  if (gaussDev <= 0.0) {
    // plain voting: every counted neighbor has the same weight
    for (unsigned int i = nStart; i < nearest.size(); ++i) {
      unsigned int cl = dataClasses[nearest[i]];
      ERROR_IF(cl >= result.size(), "Class id larger than excepted!");
      result[cl] += addTerm;
    }
    for (unsigned int i = 0; i < result.size(); ++i) {
      if (result[i] > 1.0) {
        ASSERT(result[i] < 1.1); // only rounding errors allowed
        result[i] = 1.0;
      }
    }
  }
  else { // if gaussian weighting activated
    const double gaussDev2 = gaussDev * gaussDev; // use square
    for (unsigned int i = nStart; i < nearest.size(); ++i) {
      unsigned int cl = dataClasses[nearest[i]];
      ERROR_IF(cl >= result.size(), "Class id larger than excepted!");
      const double dist = euclidianDistanceSquare(v, data[nearest[i]], scaling);
      result[cl] += addTerm * exp(-dist/gaussDev2); // gaussian weighting
    }
    probabilityNormalize(result); // normalize such that sum of values is 1.0
  }
  if (verboseLevel > 4) {
    cout << "KnnNode: result: " << result << " End of predictClassProb for " << v << endl;
  }
  // cache values:
  lastPrediction = result;
  return result;
}

/** Same prediction as predictClassProb(v). If statistics collection is
    running, the prediction is additionally recorded against the known
    class of v via updateStatistics. */
Vec<double> 
KnnNode::predictClassProb(const Vec<double>& v, unsigned int knownClass) const 
{
  Vec<double> prediction = predictClassProb(v);
  if (!statisticsRunning) {
    return prediction;
  }
  updateStatistics(prediction, knownClass);
  return prediction;
}

/** Chooses the training vectors that estimateAccuracy uses as queries:
    a random index subset of the data, limited to at most 250000 entries.
    The indices come from generateRandomIndexSubset(subsetSize, data.size(), 0);
    the meaning of its third argument is not visible here.

    Historical note: an earlier scheme (all class-one entries, their closest
    non-class-one entries, plus 500 random entries) was dropped because it
    skewed the data distribution. */
Vec<unsigned int>
KnnNode::initEstimateAccuracy() const
{
  const unsigned int sizeLimit = 250000;
  unsigned int subsetSize = sizeLimit;
  if (data.size() < sizeLimit) {
    subsetSize = data.size(); // fewer vectors than the limit: use them all
  }
  return generateRandomIndexSubset(subsetSize, data.size(), 0);
}

/** returns prediction accuracy using leave one out estimation (numTrial times) */
double
KnnNode::estimateAccuracy(unsigned int numTrials) const
{
  if (!isValid()) {
    cout << "KnnNode not sufficiently defined for accuracy estimation!"
	 << endl;
    return -1000.0;
  }
  // cout << "Starting estimateAccuracy!" << endl;
  if (estimateSet.size() == 0) {
    estimateSet = initEstimateAccuracy();
  }
  unsigned int correctPredCounter = 0;
  unsigned int tp = 0;
  unsigned int fp = 0;
  unsigned int tn = 0;
  unsigned int fn = 0;
  for (unsigned int ii = 0; ii < estimateSet.size(); ++ii) {
    unsigned int j = estimateSet[ii]; // use j'th training vector as query
    // cout << "Testing " << ii << " " << j << endl;
    ERROR_IF(j >= dataClasses.size(), "Internal error in line 155!");
    ERROR_IF(j >= data.size(), "Internal error in line 157!");
    const Vec<double>& v = data[j];
    // Vec<unsigned int> nearest = kNearestNeighbors(v, data, kk+1, scaling); // kk+1: later ignore hit with itself
//     Vec<unsigned int> nearest = kNearestNeighbors(v, data, kk+1, scaling,
//  						  clusters, clusterCutoff);
    Vec<unsigned int> nearest = kNearestNeighbors(v, data, clustData, 
						  kk+1, scaling,
			  clusters, subClusters,
				  clusterCutoff, clusterCutoff2);
    Vec<double> result(numClasses, 0.0);
    double addTerm = 1.0 / static_cast<double>(nearest.size()-1.0);
    double dist = 0.0;
    if (gaussDev <= 0.0) {
      for (unsigned int i = 0; i < nearest.size(); ++i) {
	if (nearest[i] == j) {
	  continue; // ignore hit with "self"
	}
	unsigned int cl = dataClasses[nearest[i]];
	ASSERT(cl < result.size());
	result[cl] += addTerm;
      }
    }
    else { // if gaussian weighting is switched on
      for (unsigned int i = 0; i < nearest.size(); ++i) {
	if (nearest[i] == j) {
	  continue; // ignore hit with "self"
	}
	unsigned int cl = dataClasses[nearest[i]];
	dist = euclidianDistance(v, data[nearest[i]]);
	ASSERT(cl < result.size());
	result[cl] += addTerm * exp(-dist/gaussDev); // gaussian weighting
      }
      probabilityNormalize(result); // normalize such that sum of values is 1.0
    }
    // find highest class:
    // cout << "Class result counting: " << result << endl;
    unsigned int maxIndex = findMaxIndex(result);
    if (maxIndex == dataClasses[j]) {
      ++correctPredCounter; // correct prediction!
      if (dataClasses[j] == 1) {
	++tp;
      }
      else {
	++tn;
      }
    }
    else {
      if (dataClasses[j] == 1) {
	++fn;
      }
      else {
	++fp;
      }
    }
    // cout << result << " End of predictClassProb for " << v << endl;
  }  
  // cout << "Ending estimateAccuracy!" << endl;
  
  //   return static_cast<double>(correctPredCounter)/static_cast<double>(estimateSet.size());
  return computeMathews(tp, fp, tn, fn);
}

/** Returns copies of all data rows whose class label equals dataClass,
    in their original order. */
Vec<Vec<double> >
KnnNode::getData(unsigned int dataClass) const
{
  Vec<Vec<double> > rows;
  for (unsigned int idx = 0; idx < dataClasses.size(); ++idx) {
    if (dataClasses[idx] != dataClass) {
      continue; // row belongs to another class
    }
    rows.push_back(data[idx]);
  }
  return rows;
}

/** Returns the indices (in ascending order) of all data rows whose class
    label equals dataClass. */
Vec<unsigned int>
KnnNode::getDataIndices(unsigned int dataClass) const
{
  Vec<unsigned int> indices;
  for (unsigned int idx = 0; idx < dataClasses.size(); ++idx) {
    if (dataClasses[idx] != dataClass) {
      continue; // row belongs to another class
    }
    indices.push_back(idx);
  }
  return indices;
}

/** Returns, per class and statistics bin, the ratio
    trueCount / falseCount collected by updateStatistics.
    Entries whose falseCount is zero stay 0.0. If the counters do not have
    numClasses rows the table is returned all-zero; if a row of trueCount
    does not have NUM_STAT_BINS entries, filling stops at that row and the
    table filled so far is returned. */
Vec<Vec<double> >
KnnNode::getUsageHistogram() const {
  Vec<Vec<double> > ratios(numClasses, Vec<double>(NUM_STAT_BINS, 0.0));
  if (trueCount.size() != numClasses) {
    return ratios; // statistics not set up for the current class count
  }
  for (unsigned int cls = 0; cls < numClasses; ++cls) {
    if (trueCount[cls].size() != NUM_STAT_BINS) {
      return ratios;
    }
    for (unsigned int bin = 0; bin < NUM_STAT_BINS; ++bin) {
      if (falseCount[cls][bin] > 0) { // avoid division by zero
        const double numTrue = static_cast<double>(trueCount[cls][bin]);
        const double numFalse = static_cast<double>(falseCount[cls][bin]);
        ratios[cls][bin] = numTrue / numFalse;
      }
    }
  }
  return ratios;
}

/** Writes the training data to os, one row per line: all vector
    components, each followed by a blank, then the class label of the row. */
void
KnnNode::writeData(ostream& os) const
{
  for (unsigned int row = 0; row < data.size(); ++row) {
    const unsigned int numCols = data[row].size();
    for (unsigned int col = 0; col < numCols; ++col) {
      os << data[row][col] << " ";
    }
    os << dataClasses[row] << endl;
  }
}

/* MODIFIERS */

/* copy method: member-wise copy of other into this object
   (shared by the copy constructor and the assignment operator) */
void 
KnnNode::copy(const KnnNode& other)
{
  // scalar settings
  clusterCutoff = other.clusterCutoff;
  clusterCutoff2 = other.clusterCutoff2;
  gaussDev = other.gaussDev;
  kk = other.kk;
  numClasses = other.numClasses;
  verboseLevel = other.verboseLevel;
  noSelfMode = other.noSelfMode;
  statisticsRunning = other.statisticsRunning;

  // training data and per-dimension scaling
  data = other.data;
  dataClasses = other.dataClasses;
  scaling = other.scaling;

  // cluster hierarchy
  clusters = other.clusters;
  subClusters = other.subClusters;
  clustData = other.clustData;

  // cached values and statistics counters
  estimateSet = other.estimateSet;
  lastPrediction = other.lastPrediction;
  trueCount = other.trueCount;
  falseCount = other.falseCount;
}

/**
 * Reads training rows from a stream and appends them to the stored data.
 *
 * Each line is split with getTokens. Tokens startCol .. endCol-1 form the
 * feature vector, token classCol is the class label (parsed as a number and
 * rounded to an unsigned integer). Lines without any token are skipped.
 * numClasses is set to the highest class id read plus one (at least 1).
 * Afterwards scaling is reset to all ones and the data is reclustered.
 *
 * @param is       input stream, read until it is no longer good
 * @param startCol index of the first feature column
 * @param endCol   index one past the last feature column; must be > startCol
 * @param classCol index of the class label column
 *
 * Raises an error (ERROR_IF) on an inconsistent column range, on a
 * non-empty line lacking one of the requested columns, and if no data row
 * is available after reading.
 */
void
KnnNode::readData(istream& is,
                  unsigned int startCol,
                  unsigned int endCol,
                  unsigned int classCol)
{
  if (verboseLevel > 1) {
    cout << "KnnNode: Starting readData (1)!" << endl;
  }
  ERROR_IF(endCol <= startCol, "Inconsistent definition of start and end column for readData!");
  const unsigned int dim = endCol - startCol;
  Vec<double> dataVec(dim, 0.0);
  string line;
  // update the member (not a local): the class count is needed for prediction
  this->numClasses = 1;
  unsigned int highestClass = 0; // highest defined class id so far
  while (is) {
    line = getLine(is);
    vector<string> tokens = getTokens(line);
    if (tokens.size() == 0) {
      continue; // blank line, e.g. the one produced at end of file
    }
    for (unsigned int i = startCol; i < endCol; ++i) {
      ERROR_IF(i >= tokens.size(), "Undefined data column!");
      dataVec[i-startCol] = stod(tokens[i]);
    }
    ERROR_IF(classCol >= tokens.size(), "Undefined class column!");
    // + 0.499: round the parsed number to the nearest class id
    unsigned int dataClass = static_cast<unsigned int>(stod(tokens[classCol]) + 0.499);
    if (dataClass > highestClass) {
      highestClass = dataClass;
      numClasses = highestClass + 1; // counting starts from zero
    }
    data.push_back(dataVec);
    dataClasses.push_back(dataClass);
  }
  ERROR_IF(data.size() == 0, "No KnnNode data read!");
  scaling = Vec<double>(data[0].size(), 1.0); // one neutral factor per dimension
  recluster();
  if (verboseLevel > 1) {
    unsigned int sum = 0;
    for (unsigned int c = 0; c < subClusters.size(); ++c) {
      sum += subClusters[c].size();
    }
    cout << data.size() << " entries data lines read! " << clusters.size() 
         << " clusters and ";
    cout << sum << " subclusters formed." << endl;
  }
}

/**
 * Reads training rows from a stream and appends them to the stored data,
 * selecting feature columns through a mask.
 *
 * Each line is split with getTokens. The i'th feature is token mask[i]; the
 * class label is the last token of the line (parsed as a number and rounded
 * to an unsigned integer). A line is skipped silently if it has no tokens
 * or no more tokens than the class column index of the previously accepted
 * line. numClasses is set to the highest class id read plus one (at
 * least 1). Afterwards scaling is reset to all ones and the data is
 * reclustered.
 *
 * @param is   input stream, read until it is no longer good
 * @param mask column indices of the features; must not be empty
 *
 * Raises an error (ERROR_IF) on an empty mask, on a mask entry beyond the
 * tokens of an accepted line, and if no data row is available after
 * reading.
 */
void
KnnNode::readData(istream& is,
                  const Vec<unsigned int>& mask)
{
  if (verboseLevel > 1) {
    cout << "KnnNode: Starting readData (2)!" << endl;
  }
  ERROR_IF(mask.size()==0, "Inconsistent definition of readData!");
  string line;
  unsigned int classCol = 0; // class column of the last accepted line
  const unsigned int dim = mask.size();
  Vec<double> dataVec(dim, 0.0);
  this->numClasses = 1;
  unsigned int highestClass = 0; // highest defined class id so far
  while (is) {
    line = getLine(is);
    vector<string> tokens = getTokens(line);
    if (tokens.size() <= classCol) {
      continue; // empty line, or shorter than the previously accepted line
    }
    classCol = tokens.size()-1; // class label is the last token
    for (unsigned int i = 0; i < dim; ++i) {
      ERROR_IF(mask[i] >= tokens.size(), "Undefined data column!");
      dataVec[i] = stod(tokens[mask[i]]);
    }
    // + 0.499: round the parsed number to the nearest class id
    unsigned int dataClass = static_cast<unsigned int>(stod(tokens[classCol]) + 0.499);
    if (dataClass > highestClass) {
      highestClass = dataClass;
      numClasses = highestClass + 1; // counting starts from zero
    }
    data.push_back(dataVec);
    dataClasses.push_back(dataClass);
  }
  // must be checked before data[0] is touched below
  ERROR_IF(data.size() == 0, "No KnnNode data read!");
  scaling = Vec<double>(data[0].size(), 1.0); // one neutral factor per dimension
  recluster();
  if (verboseLevel > 1) {
    unsigned int sum = 0;
    for (unsigned int c = 0; c < subClusters.size(); ++c) {
      sum += subClusters[c].size();
    }
    cout << data.size() << " entries data lines read! " << clusters.size() 
         << " clusters and " << sum << " subclusters formed." << endl;
  }
}

void
KnnNode::recluster() {
  // cout << "Cutoffs: " << clusterCutoff << " " << clusterCutoff2 << endl;
  clusters = simpleRepresentativeLinkage(data, clusterCutoff);
  subClusters = Vec<Vec<Vec<unsigned int> > >(clusters.size());
  // subClustersAbs = Vec<Vec<Vec<unsigned int> > >(clusters.size());
  clustData = Vec<Vec<Vec<double> > >(clusters.size());
  for (unsigned int i = 0; i < subClusters.size(); ++i) {
    clustData[i] = getSubset(data, clusters[i]);
    // clustClasses[i] = getSubset(dataClasses, clusters[i]);
    subClusters[i] = simpleRepresentativeLinkage(clustData[i], clusterCutoff2); // careful: new indices!!!???
    // subClustersAbs[i] = subClusters[i];
    //     for (unsigned int j = 0; j < subClusters[i].size(); ++j) {
    //       for (unsigned int k = 0; k < subClusters[i][j].size(); ++k) {
    // 	subClustersAbs[i][j][k] = clusters[i][subClusters[i][j][k]]; // convert to "absolute" indices
    //       }
    //     }
  }
  //   cout << "Clusters: " << clusters << endl;
  // cout << "Subclusters: " << subClusters << endl;
}

/**
 * Optimizes k and the per-dimension scaling of this node with simple
 * Monte Carlo steps, using estimateAccuracy as the score to maximize.
 *
 * Procedure:
 *  1. scan k over a small window around the current kk and keep the best;
 *  2. numSteps times: perturb every component of the best scaling so far by
 *     stepWidth * (Gaussian random number), mirror negative components to
 *     positive, normalize with normalizeEuclidian, and keep the candidate
 *     if it scores strictly better;
 *  3. scan k over the same window once more.
 * On return kk and scaling hold the best values found.
 *
 * @param numSteps     number of Monte Carlo scaling steps
 * @param verboseLevel output verbosity of this call (hides the member of
 *                     the same name inside this function)
 * @param stepWidth    factor applied to the Gaussian perturbation
 * @param numTrials    passed through to estimateAccuracy
 */
void
KnnNode::optimizeScaling(int numSteps, 
                         int verboseLevel,
                         double stepWidth,
                         unsigned int numTrials)
{
  if (verboseLevel > 1) {
    cout << "Starting optimize scaling of knnNode with scaling " << scaling 
         << " and k " << kk << endl;
  }
  Vec<double> bestScaling = scaling;
  const double origScore = estimateAccuracy(numTrials);
  double bestScore = origScore;
  double score;
  if (verboseLevel > 0) {
    cout << "Score of original scaling: " << origScore << endl;
  }
  Random& rnd = Random::getInstance();

  // optimize kk: window is [kk-kDiff, kk+kDiff], but the lower bound stays
  // at 3 unless kk is larger than 3 + kDiff
  const unsigned int kkOrig = kk;
  unsigned int kBest = kk;
  unsigned int kMin = 3;
  const unsigned int kDiff = 1; // only allow for minimal changes of k
  if (kk > (kMin+kDiff)) {
    kMin = kkOrig - kDiff;
  }
  const unsigned int kMax = kkOrig + kDiff + 1; // exclusive upper bound
  for (unsigned int k = kMin; k < kMax; ++k) {
    kk = k;
    score = estimateAccuracy(numTrials);    
    if (score > bestScore) {
      kBest = k;
      bestScore = score;
      if (verboseLevel > 0) {
        cout << "Found better k: " << kBest << " " << bestScore << endl;
      }
    }
  }
  if (verboseLevel > 0) {
    cout << "Now using optimized k: " << kBest << " " << bestScore << endl;
  }
  kk = kBest;

  // Monte Carlo search over the scaling vector; every step starts again
  // from the best scaling found so far
  for (unsigned int i = 0; static_cast<int>(i) < numSteps; ++i) {
    scaling = bestScaling;
    for (unsigned int j = 0; j < scaling.size(); ++j) {
      scaling[j] += stepWidth * rnd.getGaussian();
      if (scaling[j] < 0.0) {
        scaling[j] *= (-1); // invert again to positive value
      }
    }
    normalizeEuclidian(scaling);
    score = estimateAccuracy(numTrials);
    if (verboseLevel > 1) {
      cout << i + 1 << " " << score << " " << scaling << endl;
    }
    if (score > bestScore) {
      bestScore = score;
      bestScaling = scaling;
      if (verboseLevel > 0) {
        cout << i + 1 << " New best scaling: " << bestScore << " " 
             << bestScaling << endl;
      }
    }
  }
  scaling = bestScaling;

  // the best k may have changed with the new scaling: scan the window again
  for (unsigned int k = kMin; k < kMax; ++k) {
    kk = k;
    score = estimateAccuracy(numTrials);    
    if (score > bestScore) {
      kBest = k;
      bestScore = score;
      if (verboseLevel > 0) {
        cout << "Found better k: " << kBest << " " << bestScore << endl;
      }
    }
  }
  if (verboseLevel > 0) {
    cout << "Now using optimized k: " << kBest << endl;
  }
  kk = kBest;

  if (verboseLevel > 0) {
    cout << "End result best scaling: " << kk << " " << bestScore << " " 
         << bestScaling << endl;
  }
}

void
KnnNode::startStatistics() 
{
  ERROR_IF(numClasses == 0, "No classes defined when trying to start statistics mode!");	   
  trueCount = Vec<Vec<unsigned int long> >(numClasses, Vec<unsigned int long>(NUM_STAT_BINS, 0));
  falseCount = Vec<Vec<unsigned int long> >(numClasses, Vec<unsigned int long>(NUM_STAT_BINS, 0));
  statisticsRunning = true;
}

void
KnnNode::thin(unsigned int thinK)
{
  Vec<unsigned int> newIndices = knnThin(data, dataClasses, thinK);
  data = getSubset(data, newIndices);
  dataClasses = getSubset(dataClasses, newIndices);
  recluster();
}

void
KnnNode::updateStatistics(const Vec<double>& prediction, unsigned int knownClass) const
{
  unsigned int bin = (static_cast<int>(NUM_STAT_BINS * prediction[knownClass])) % NUM_STAT_BINS;
  for (unsigned int i = 0; i < numClasses; ++i) {
    if (i == knownClass) {
      ++trueCount[i][bin];
    }
    else {
      ++falseCount[i][bin];
    }
  }
}
