// --*- C++ -*------x---------------------------------------------------------
// $Id: KnnNet2.cc,v 1.2 2008/12/13 16:03:13 bindewae Exp $
//
// Class:           KnnNet2
// 
// Base class:      -
//
// Derived classes: - 
//
// Author:          Eckart Bindewald
//
// Project name:    -
//
// Date:            $Date: 2008/12/13 16:03:13 $
//
// Description:     - 
// -----------------x-------------------x-------------------x-----------------

#include <KnnNet2.h>
#include <vectornumerics.h>
#include <Random.h>
#include <clusterAlgorithms.h>
#include <FileName.h>
#include <stdio.h>
#include <generalNumerics.h>
#include <fstream>

// #define DEBUG_VERBOSE

// ---------------------------------------------------------------------------
//                                   KnnNet2
// -----------------x-------------------x-------------------x-----------------

/* CONSTRUCTORS */

/* default constructor: a net without levels, verbosity 0, no write level
   selected (-1), no forced node (-1) and the default cutoff values */
KnnNet2::KnnNet2()
  : knnForceId(-1),
    numLevels(0),
    verboseLevel(0),
    writeLevel(-1),
    clusterCutoff(0.1),
    clusterCutoff2(0.01),
    signCutoff(0.5)
{
}

/* copy constructor: the members are default-constructed first and then
   overwritten with the state of "other" through copy() */
KnnNet2::KnnNet2(const KnnNet2& other)
{
  this->copy(other);
}

/* destructor: the body is empty, no explicit cleanup is performed here */
KnnNet2::~KnnNet2() { 
}

/* OPERATORS */

/** Assignment operator.

    Takes over the state of orig through copy(); assigning an object to
    itself is detected and leaves it untouched.

    @param orig object whose state is taken over
    @return reference to this object */
KnnNet2& 
KnnNet2::operator = (const KnnNet2& orig)
{
  if (this == &orig) {
    return *this; // self assignment: nothing to do
  }
  copy(orig);
  return *this;
}

/** Stream output operator: writes the parameter file of the net to os by
    delegating to KnnNet2::write.

    @param os   stream that receives the parameter file
    @param rval net to be written
    @return os, so that output can be chained */
ostream& 
operator << (ostream& os, const KnnNet2& rval)
{
  const KnnNet2& net = rval;
  net.write(os);
  return os;
}


/** Stream input operator. Not implemented: it only raises ERROR and reads
    nothing from the stream; rval is left untouched. Use KnnNet2::read for
    reading a net definition.

    @param is   input stream (not read from)
    @param rval net (not modified)
    @return is */
istream& 
operator >> (istream& is, KnnNet2& rval)
{
  ERROR("Input operator not yet implemented!");
  return is;
}

/* PREDICATES */

/** Is current state valid?

    Use this predicate to check if the network connection is
    established and working. 

    @author Otto Peter 
    @return true <=> current state is valid 
    @bugs   Does not always behave like expected.
    @see    Otto Schreiber: "Writing programms with luck." */
bool
KnnNet2::isValid() const
{
  return (numLevels > 0);
}

/** How big is object?

    @return number of knn nodes stored in the net (not the number of levels) */
unsigned int
KnnNet2::size() const
{
  const unsigned int nodeCount = knnNodes.size();
  return nodeCount;
}

/** Central prediction method: evaluates all numLevels levels of the net for
    feature vector v (see predictClassProbLevel) and remembers the result in
    lastPrediction.

    @param v feature vector fed to the first level
    @return output vector of the last level */
Vec<double>
KnnNet2::predictClassProb(const Vec<double>& v) const
{
  const Vec<double> result = predictClassProbLevel(v, numLevels);
  lastPrediction = result; // keep a copy of the most recent prediction
  return result;
}


/** Propagates a feature vector through the first "level" levels of the net.

    Every level starts from a zero vector with levelTargetDims[i] entries.
    Each entry of the level that has a non-zero multiplier adds its
    contribution at output position knnLevelTargetIds[i][j], depending on
    its node id:
      - id >= 0 : knn node. Adds multiplier times the predicted probability
                  of class knnLevelTargetDim[i][j]. Only two-class nodes are
                  accepted; invalid nodes are skipped; an unreasonable
                  probability is reported and counted as 0.
      - id == -2: linear node. Adds multiplier times each masked input value.
      - id == -3: voting node (not allowed in the first level). Adds the
                  vote multiplier times signum(input - signCutoff) for each
                  masked input value; the position is passed through
                  logistic() once the level is complete.
    Any other id raises an error. The output of a level is the input of the
    following level.

    @param v     feature vector fed to the first level
    @param level number of levels to evaluate
    @return output of the last evaluated level; a default-constructed vector
            if level is 0 */
Vec<double>
KnnNet2::predictClassProbLevel(const Vec<double>& v, unsigned int level) const
{
  Vec<double> levelInput = v; // what the current level reads
  Vec<double> levelOutput;    // what the current level produces
  Vec<double> nodeInput;      // masked part of levelInput seen by one entry
  for (unsigned int lev = 0; lev < level; ++lev) {
    levelOutput = Vec<double>(levelTargetDims[lev], 0.0);
    // a "1" marks output positions that were filled by a voting node
    Vec<int> votedPositions(levelOutput.size(), 0);
    for (unsigned int n = 0; n < knnLevelIds[lev].size(); ++n) {
      if (knnLevelMultipliers[lev][n] == 0.0) {
        continue; // zero weight: this entry cannot contribute
      }
      const int nodeId = knnLevelIds[lev][n];
      nodeInput = getSubset(levelInput, levelMasks[lev][n]);

      if (nodeId >= 0) { // knn node
        const KnnNode2& node = knnNodes[nodeId];
        if (!node.isValid()) {
          continue; // node not sufficiently defined
        }
        if (verboseLevel > 3) {
          cout << "Used subset vector for knn classifier input: " 
               << levelMasks[lev][n] << " " << nodeInput;
        }
        const Vec<double> classProbs = node.predictClassProb(nodeInput);
        ERROR_IF(classProbs.size() != 2, "Currently predictions supported only for two classes!");
        ERROR_IF((knnLevelTargetDim[lev][n] >= classProbs.size()),
                 "Internal error in line 156!");
        double contribution = classProbs[knnLevelTargetDim[lev][n]];
        if (!isReasonable(contribution)) {
          cout << "Warning: Unreasonable prediction accuracy found!"
               << " Input: " << v << " " 
               << " level: " << level + 1 << " " 
               << lev + 1 << endl;
          contribution = 0.0;
        }
        ERROR_IF(!isReasonable(knnLevelMultipliers[lev][n]),
                 "internal error in line 161!");
        levelOutput[knnLevelTargetIds[lev][n]] += contribution * knnLevelMultipliers[lev][n];
        ERROR_IF(!isReasonable(levelOutput[knnLevelTargetIds[lev][n]]),
                 "internal error in line 164!");
      }
      else if (nodeId == -2) { // linear node: weighted sum of the masked input
        if (verboseLevel > 2) {
          cout << "Used subset vector for linear classifier input: " << lev << " " << n << " "
               << levelMasks[lev][n] << " " << nodeInput;
        }
        for (unsigned int pos = 0; pos < nodeInput.size(); ++pos) {
          levelOutput[knnLevelTargetIds[lev][n]] += knnLevelMultipliers[lev][n] * nodeInput[pos];
        }
      }
      else if (nodeId == -3) { // voting node: signs relative to signCutoff
        if (verboseLevel > 0) {
          cout << "Used voting among subset vector for linear classifier input: " << lev << " " << n << " "
               << levelMasks[lev][n] << " " << nodeInput;
        }
        ERROR_IF(lev < 1, "Voting nodes cannot be used in first level!");
        for (unsigned int pos = 0; pos < nodeInput.size(); ++pos) {
          ERROR_IF(!isReasonable(nodeInput[pos]), "unreasonable number detected as part of feature vec!");
          levelOutput[knnLevelTargetIds[lev][n]] +=
            knnLevelVoteMultipliers[lev][n][pos] * signum(nodeInput[pos]-signCutoff);
        }
        votedPositions[knnLevelTargetIds[lev][n]] = 1; // squash this position below
      }
      else {
        ERROR("Undefined knn node id!");
      }
    }
    // summed votes are squashed with the logistic function
    for (unsigned int pos = 0; pos < levelOutput.size(); ++pos) {
      if (votedPositions[pos] == 1) {
        levelOutput[pos] = logistic(levelOutput[pos]);
      }
    }
    levelInput = levelOutput;
    if (verboseLevel > 2) {
      cout << "Intermediate result of predictClassProbLevel called with level " 
           << level + 1<< " " << lev + 1 << " : " << levelOutput << " called with: " 
           << v;
    }    
  }
  if (verboseLevel > 2) {
    cout << "Result of predictClassProbLevel called with level " 
         << level << " and vector " << v << " : "
         << levelOutput << endl;
  }
  return levelOutput;
}

/** Prediction with a known class: evaluates all numLevels levels of the net
    for feature vector v and stores the result in lastPrediction.

    Works like predictClassProbLevel, except that knn nodes are queried with
    node.predictClassProb(subset, knownClass) and no sanity checks are made
    on the intermediate values. For knownClass == 1 progress messages are
    written to cout.

    Each entry of a level contributes according to its node id:
      - id >= 0 : knn node (skipped if not valid);
      - id == -2: linear node, multiplier times each masked input value;
      - id == -3: voting node, vote multiplier times
                  signum(input - signCutoff); the position is passed through
                  logistic() once the level is complete.
    Any other id raises an error.

    @param v          feature vector fed to the first level
    @param knownClass class handed on to the knn nodes
    @return output vector of the last level */
Vec<double>
KnnNet2::predictClassProb(const Vec<double>& v, unsigned int knownClass) const
{
  if (knownClass == 1) {
    cout << "Starting KnnNet2::predictClassProb with known class " << knownClass << " " << v << endl;
  }
  Vec<double> oldV = v; // old feature vector
  Vec<double> newV; // new feature vector
  Vec<double> subV; // subset of feature vector seen by one entry
  for (unsigned int i = 0; i < numLevels; ++i) {
    newV = Vec<double>(levelTargetDims[i], 0.0);
    Vec<int> useVoting(newV.size(), 0); // if "1", use signum voting for that position
    for (unsigned int j = 0; j < knnLevelIds[i].size(); ++j) {
      if (knnLevelMultipliers[i][j] == 0.0) {
        continue; // zero weight: this entry cannot contribute
      }
      const int jj = knnLevelIds[i][j];
      if (jj >= 0) {
        const KnnNode2& node = knnNodes[jj];
        if (!node.isValid()) {
          continue;
        }
        subV = getSubset(oldV, levelMasks[i][j]);
        const double term = node.predictClassProb(subV, knownClass)[knnLevelTargetDim[i][j]];
        newV[knnLevelTargetIds[i][j]] += knnLevelMultipliers[i][j] * term;
      }
      else if (jj == -2) { // linear node: simple weighted sum
        subV = getSubset(oldV, levelMasks[i][j]);
        for (unsigned int k = 0; k < subV.size(); ++k) {
          newV[knnLevelTargetIds[i][j]] += knnLevelMultipliers[i][j] * subV[k];
        }
      }
      else if (jj == -3) { // voting node: sum of signs relative to signCutoff
        // The subset has to be taken for this entry as well; otherwise the
        // vote would run over whatever subset the previous entry left behind.
        subV = getSubset(oldV, levelMasks[i][j]);
        if (knownClass == 1) {
          cout << "Used voting among subset vector for linear classifier input: " << i << " " << j << " " 
               << levelMasks[i][j] << " " << subV;
        }
        for (unsigned int k = 0; k < subV.size(); ++k) {
          newV[knnLevelTargetIds[i][j]] += knnLevelVoteMultipliers[i][j][k] * signum(subV[k]-signCutoff);
        }
        useVoting[knnLevelTargetIds[i][j]] = 1; // use voting for that position
      }
      else {
        ERROR("Undefined knn node id!");
      }
    }
    // summed votes are squashed with the logistic function
    for (unsigned int k = 0; k < newV.size(); ++k) {
      if (useVoting[k] == 1) {
        if (knownClass == 1) {
          cout << "Using voting for position " << k + 1 << " " << newV[k] << " " << ((0.5 * newV[k]) + 0.5) << endl;
        }
        newV[k] = logistic(newV[k]);
      }
    }
    oldV = newV;
  }
  if (knownClass == 1) {
    cout << "Ending KnnNet2::predictClassProb!" << endl;
  }
  lastPrediction = newV;
  return newV;
}

/** Prediction used while generating training data: evaluates all numLevels
    levels of the net for feature vector v, stores the result in
    lastPrediction and, for the level selected by writeLevel, appends the
    output vector of that level followed by trueClass as one line to
    writeFile.

    Each entry of a level contributes according to its node id:
      - id >= 0 : knn node (skipped if not valid), queried with
                  node.predictClassProb(subset, trueClass);
      - id == -2: linear node, multiplier times each masked input value;
      - id == -3: voting node, vote multiplier times
                  signum(input - signCutoff); the position is passed through
                  logistic() exactly once, after all entries of the level
                  have been summed up.
    Any other id raises an error.

    @param v         feature vector fed to the first level
    @param trueClass known class of v; handed on to the knn nodes and written
                     at the end of the output line
    @param writeFile stream receiving the intermediate feature vector
    @return output vector of the last level */
Vec<double>
KnnNet2::predictClassProbTrain(const Vec<double>& v, unsigned int trueClass,
                               ostream& writeFile) const
{
  Vec<double> oldV = v; // old feature vector
  Vec<double> newV; // new feature vector
  Vec<double> subV; // subset of feature vector seen by one entry
  for (unsigned int i = 0; i < numLevels; ++i) {
#ifdef DEBUG_VERBOSE
    cout << "Working on knn level: " << i + 1 << endl;
#endif
    newV = Vec<double>(levelTargetDims[i], 0.0);
    Vec<int> useVoting(newV.size(), 0); // if "1", use signum voting for that position
    for (unsigned int j = 0; j < knnLevelIds[i].size(); ++j) {
      if (knnLevelMultipliers[i][j] == 0.0) {
        continue; // zero weight: this entry cannot contribute
      }
      const int jj = knnLevelIds[i][j];
      if (jj >= 0) {
        const KnnNode2& node = knnNodes[jj];
        if (!node.isValid()) {
          continue; // node not sufficiently defined
        }
        subV = getSubset(oldV, levelMasks[i][j]);
        const double term = node.predictClassProb(subV, trueClass)[knnLevelTargetDim[i][j]];
#ifdef DEBUG_VERBOSE
        cout << "Input feature vector for node " << jj + 1 << " " << subV 
             << " result: " << term << " " << knnLevelTargetIds[i][j] 
             << " " << knnLevelMultipliers[i][j] << endl;
#endif
        newV[knnLevelTargetIds[i][j]] += knnLevelMultipliers[i][j] * term;
      }
      else if (jj == -2) { // linear node: simple weighted sum
        subV = getSubset(oldV, levelMasks[i][j]);
        for (unsigned int k = 0; k < subV.size(); ++k) {
          newV[knnLevelTargetIds[i][j]] += knnLevelMultipliers[i][j] * subV[k];
        }
      }
      else if (jj == -3) { // voting node: sum of signs relative to signCutoff
        // The subset has to be taken for this entry as well; otherwise the
        // vote would run over whatever subset the previous entry left behind.
        subV = getSubset(oldV, levelMasks[i][j]);
        if (verboseLevel > 0) {
          cout << "Used voting among subset vector for linear classifier input: " 
               << levelMasks[i][j] << " " << subV;
        }
        for (unsigned int k = 0; k < subV.size(); ++k) {
          newV[knnLevelTargetIds[i][j]] += knnLevelVoteMultipliers[i][j][k] * signum(subV[k]-signCutoff);
        }
        useVoting[knnLevelTargetIds[i][j]] = 1; // use voting for that position
      }
      else {
        ERROR("Undefined knn node id!");
      }
    }
    // Squash the summed votes only after ALL entries of the level have
    // contributed. Doing this inside the entry loop would apply logistic()
    // again after every later entry of the same level.
    for (unsigned int k = 0; k < newV.size(); ++k) {
      if (useVoting[k] == 1) {
        cout << "Using voting for position " << k + 1 <<  " " 
             << newV[k] << " " << ((0.5 * newV[k]) + 0.5) << endl;
        newV[k] = logistic(newV[k]);
      }
    }
#ifdef DEBUG_VERBOSE
    cout << "Intermediate feature vector after level " << i + 1 << " : " << newV << endl;
#endif
    if (writeLevel == static_cast<int>(i)) {
      // one line per call: output of this level followed by the true class
      ERROR_IF(!writeFile, "Error writing to knn training file!");
      for (unsigned int k = 0; k < newV.size(); ++k) {
        writeFile << newV[k] << " ";
        ERROR_IF(!writeFile, "Error writing to knn training file!");
      }
      writeFile << trueClass << endl;
    }
    oldV = newV;
  }
  lastPrediction = newV;
  return newV;
}

/** Writes intermediate feature vectors of stored training vectors to
    writeFile by running predictClassProbTrain on them.

    The sign of numEntries selects which stored data rows are used:
      - numEntries > 0 : numEntries times a random row of a random knn node;
      - numEntries == 0: every row of every knn node;
      - numEntries < 0 : for each row of the first knn node listed by
                         getDataIndices(1), that row followed by one random
                         row of the first knn node.

    @param writeFile  stream handed on to predictClassProbTrain
    @param numEntries selection mode / number of random draws, see above */
void
KnnNet2::writeLevelTrainVectors(ostream& writeFile, int numEntries) const
{
  Random& rnd = Random::getInstance();

  if (numEntries > 0) { // random sampling over all nodes
    for (int drawn = 0; drawn < numEntries; ++drawn) {
      const unsigned int nodeIdx = rnd.getRand(size()); // random knn node
      const unsigned int rowIdx = rnd.getRand(knnNodes[nodeIdx].size()); // random entry
      predictClassProbTrain(knnNodes[nodeIdx].getDataRow(rowIdx), 
                            knnNodes[nodeIdx].getDataRowClass(rowIdx), writeFile);
    }
    return;
  }

  if (numEntries == 0) { // exhaustive: all rows of all nodes
    const int nodeCount = static_cast<int>(size());
    for (int nodeIdx = 0; nodeIdx < nodeCount; ++nodeIdx) {
      for (unsigned int rowIdx = 0; rowIdx < knnNodes[nodeIdx].size(); ++rowIdx) {
        predictClassProbTrain(knnNodes[nodeIdx].getDataRow(rowIdx), 
                              knnNodes[nodeIdx].getDataRowClass(rowIdx), writeFile);
      }
    }
    return;
  }

  // numEntries < 0: "smart" mode, works on the first knn node only
#ifdef DEBUG_VERBOSE
  cout << "starting writeLevelTrainFile in smart mode!" << endl;
#endif
  // indices of data rows belonging to class "1"
  Vec<unsigned int> classOneRows = knnNodes[0].getDataIndices(1);
  for (unsigned int pos = 0; pos < classOneRows.size(); ++pos) {
    const unsigned int rowIdx = classOneRows[pos];
#ifdef DEBUG_VERBOSE
    cout << "Using data vector : " << knnNodes[0].getDataRow(rowIdx) 
         << "   " << knnNodes[0].getDataRowClass(rowIdx) << endl;
#endif    
    predictClassProbTrain(knnNodes[0].getDataRow(rowIdx), 
                          knnNodes[0].getDataRowClass(rowIdx), writeFile);
    // pair it with one randomly chosen row of the same node
    const unsigned int randomRow = rnd.getRand(knnNodes[0].size());
#ifdef DEBUG_VERBOSE
    cout << "Using data vector : " << knnNodes[0].getDataRow(randomRow) 
         << "   " << knnNodes[0].getDataRowClass(randomRow) << endl;
#endif
    predictClassProbTrain(knnNodes[0].getDataRow(randomRow), 
                          knnNodes[0].getDataRowClass(randomRow), writeFile);
  }
}

/** Writes the parameter file of the net to os.

    Output consists of two sections:
      1. the number of knn nodes, then per node the original data file name,
         k, the original mask (size followed by the 1-based entries) and the
         scaling vector;
      2. the number of levels, then per level a header (number of entries and
         target dimension) and per entry node id, target id and class
         dimension (each written incremented by one), the multiplier and the
         mask converted with externalCounting (size followed by the entries).
         Voting entries (id -3) are followed by their vote multipliers, all
         other entries by a line break.

    @param os stream that receives the parameter file */
void
KnnNet2::write(ostream& os) const
{
  // section 1: knn nodes
  os << knnNodes.size() << endl;
  for (unsigned int node = 0; node < knnNodes.size(); ++node) {
    os << knnOrigFileNames[node] << " " << knnNodes[node].getK() << " ";
    const Vec<unsigned int>& origMask = knnOrigMasks[node];
    os << origMask.size() << "  ";
    for (unsigned int m = 0; m < origMask.size(); ++m) {
      os << origMask[m] + 1 << " "; // 1-based counting in the file
    }
    os << knnNodes[node].getScaling();
  }

  // section 2: levels
  os << numLevels << endl;
  for (unsigned int lev = 0; lev < numLevels; ++lev) {
    const Vec<int>& levelIds = knnLevelIds[lev];
    os << levelIds.size() << " " << levelTargetDims[lev] << endl;
    for (unsigned int n = 0; n < levelIds.size(); ++n) {
      os << levelIds[n] + 1 << " " << knnLevelTargetIds[lev][n] + 1 << " "
         << knnLevelTargetDim[lev][n] + 1 << " "
         << knnLevelMultipliers[lev][n] << " ";
      const Vec<unsigned int> fileMask = externalCounting(levelMasks[lev][n]);
      os << fileMask.size() << "  ";
      for (unsigned int m = 0; m < fileMask.size(); ++m) {
        os << fileMask[m] << " ";
      }
      if (levelIds[n] != -3) {
        os << endl;
      }
      else {
        os << knnLevelVoteMultipliers[lev][n]; // special case for voting node
      }
    }
  }
}

/** Writes the data of every knn node to a file of its own. The file name is
    saveDir (after addSlash, which presumably appends a missing trailing
    slash) followed by the original data file name of the node.

    @param saveDir directory receiving the data files */
void
KnnNet2::saveData(string saveDir) const {
  addSlash(saveDir);
  const unsigned int nodeCount = knnNodes.size();
  for (unsigned int node = 0; node < nodeCount; ++node) {
    const string target = saveDir + knnOrigFileNames[node];
    ofstream outFile(target.c_str());
    ERROR_IF(!outFile, "Error opening knn data output file!");
    knnNodes[node].writeData(outFile);
    outFile.close();
  }
}

/* MODIFIERS */

/* copy method */
void 
KnnNet2::copy(const KnnNet2& other)
{
    knnForceId = other.knnForceId;
    numLevels = other.numLevels;
    verboseLevel = other.verboseLevel;
    writeLevel = other.writeLevel;
    clusterCutoff = other.clusterCutoff;
    clusterCutoff2 = other.clusterCutoff2;
    signCutoff = other.signCutoff;
    writeFileName = other.writeFileName;
    knnNodes = other.knnNodes;
    properties = other.properties;
    knnOrigMasks = other.knnOrigMasks;
    knnOrigFileNames = other.knnOrigFileNames;
    knnLevelIds = other.knnLevelIds;
    levelMasks = other.levelMasks;
    levelTargetDims = other.levelTargetDims;
    knnLevelTargetIds = other.knnLevelTargetIds;
    knnLevelTargetDim = other.knnLevelTargetDim;
    knnLevelMultipliers = other.knnLevelMultipliers;  
    lastPrediction = other.lastPrediction;
}

/** Reads the definition of a network of k-nearest neighbor nodes.

    Layout of the input, as consumed below (ids and masks use 1-based
    counting in the file and are converted to 0-based counting here):
      <number of nodes>
      per node : <data file name> <k> <mask> <scaling>
                 followed by <lookup file name> if k == 1
      <number of levels>
      per level: <number of entries> <target dimension>
        per entry: <node id> <target id> <class dim> <multiplier> <mask>
                   followed by <vote multipliers> for voting entries
    The node id is decremented after reading; a decremented id of -3 marks a
    voting entry. All other entries get uniform vote multipliers
    (1 / mask size).

    Data files and lookup files are looked up relative to dataDir. If a data
    file cannot be opened, a generic node with KNNNET_DUMMY_SIZE zero rows is
    generated instead. The net is cleared before reading.

    @param is      stream with the net definition
    @param dataDir directory of the data files ("./" if empty; a missing
                   trailing '/' is appended) */
void
KnnNet2::read(istream& is, string dataDir)
{
#ifdef DEBUG_VERBOSE      
  cout << "Starting KnnNet2::read!" << endl;
#endif
  clear();
  unsigned int line = 0;
  // read and define Knn nodes:
  if (dataDir.size() == 0) {
    dataDir = "./";
  }
  else if (dataDir[dataDir.size()-1] != '/') {
    dataDir = dataDir + "/";
  }
  int numNodes = 0;
#ifdef DEBUG_VERBOSE      
  cout << "Reading KnnNet2 parameter file line " << (line++) + 1 << endl;
#endif
  is >> numNodes;
  // validate before the count is used for sizing the containers
  ERROR_IF((!is) || (numNodes < 0), "Error reading number of knn nodes!");
  knnNodes = Vec<KnnNode2>(numNodes);
  knnOrigMasks = Vec<Vec<unsigned int> >(numNodes);
  knnOrigFileNames = Vec<string>(numNodes);
  for (int i = 0; i < numNodes; ++i) {
    string dataFileName;
    unsigned int kk = 0; 
    Vec<unsigned int> knnMask;
    Vec<double> knnScale;
    is >> dataFileName >> kk >> knnMask >> knnScale;
    // stop here on a broken line instead of configuring a node from
    // values that were never read
    ERROR_IF(!is, "Error reading knn node definition!");
    if (verboseLevel > 1) {
      cout << "Reading KnnNet2 parameter file line " << (line++) + 1
           << " : Filename: " << dataFileName << " ";
    }
    convert2InternalCounting(knnMask);
    knnOrigFileNames[i] = dataFileName;
    knnOrigMasks[i] = knnMask;
    dataFileName = dataDir + dataFileName;
    knnNodes[i].setClusterCutoff(clusterCutoff);
    ifstream dataFile(dataFileName.c_str());
    if(dataFile) {
      knnNodes[i].readData(dataFile, knnMask);
    }
    else {
      // no data yet: placeholder node with KNNNET_DUMMY_SIZE zero rows
      cout << "Could not find data file " << dataFileName 
           << " . Generating generic knnNode." << endl;
      Vec<Vec<double> > dummyData(KNNNET_DUMMY_SIZE, Vec<double>(knnMask.size(), 0.0));
      Vec<unsigned int> dummyClasses(KNNNET_DUMMY_SIZE, 0U);
      dummyClasses[0] = 0;
      dummyClasses[1] = 1;
      knnNodes[i].setData(dummyData, dummyClasses, 2U, knnScale);
    }
    if (verboseLevel > 1) {
      cout << " ." << endl;
    }
    knnNodes[i].setK(kk);
    knnNodes[i].setScaling(knnScale);
    dataFile.close();
    if (kk == 1) {
      // nodes with k == 1 carry an additional lookup file
      string lookupFileName;
      is >> lookupFileName;
      lookupFileName = dataDir + lookupFileName;
      cout << "Reading lookup file: " << lookupFileName << endl;
      ifstream lookupFile(lookupFileName.c_str());
      ERROR_IF(!lookupFile, "Error opening lookup file name (expected for k == 1)!");
      knnNodes[i].readLookupDataProb(lookupFile, knnMask);
      lookupFile.close();
    }
  }
  ERROR_IF(!is, "Error after reading first section of knn net file!");
  // define levels:
#ifdef DEBUG_VERBOSE      
  cout << "Reading KnnNet2 parameter file line " << (line++) + 1 << endl;
#endif
  is >> numLevels;
  // validate before the count is used for sizing the containers
  ERROR_IF(!is, "Error reading number of levels of knn net file!");
  ERROR_IF(numLevels > KNNNET_LEVELS_MAX, "Too large number of levels!");
  knnLevelIds = Vec<Vec<int> >(numLevels);
  knnLevelTargetIds = Vec<Vec<unsigned int> >(numLevels);
  knnLevelTargetDim = Vec<Vec<unsigned int> >(numLevels);
  knnLevelMultipliers = Vec<Vec<double > >(numLevels);
  knnLevelVoteMultipliers = Vec<Vec<Vec<double > > >(numLevels);
  levelMasks = Vec<Vec<Vec<unsigned int> > >(numLevels);
  // loop over levels
  for (unsigned int i = 0; i < numLevels; ++i) {
    unsigned int numLevelKnns;
    unsigned int targetDim;
#ifdef DEBUG_VERBOSE      
    cout << "Reading KnnNet2 parameter file line " << (line++) + 1 << endl;
#endif
    is >> numLevelKnns >> targetDim;
    ERROR_IF(!is, "Error after reading level header of knn net file!");
    levelTargetDims.push_back(targetDim);
    // loop over knns
    ERROR_IF(numLevelKnns > KNNNET_LEVELNODES_MAX, "Too large number of knn's per level!");
    for (unsigned int j = 0; j < numLevelKnns; ++j) {
#ifdef DEBUG_VERBOSE      
      cout << "i j: " << i << " " << j << endl;
#endif
      int knnId;
      unsigned int targetId;
      unsigned int knnDim;
      Vec<unsigned int> mask;
      Vec<double> voteMultipliers;
      double mul;
#ifdef DEBUG_VERBOSE      
      cout << "Reading KnnNet2 parameter file line " << (line++) + 1 << endl;
#endif
      is >> knnId >> targetId >> knnDim >> mul >> mask;
      ERROR_IF(!is, "Error in reading body of knn net file!");
      --knnId; // convert to internal counting, smaller zero: use linear class
      ERROR_IF(knnId >= numNodes,
               "knn node index too high!");
      ERROR_IF(targetId < 1, "knn target id must be greater zero!");
      --targetId; // index
      ERROR_IF(targetId >= targetDim,
               "Target dimension too high!");
      ERROR_IF(knnDim == 0, "knn dim must be greater zero!");
      --knnDim;
      if (knnId == -3) {
        cout << "Reading vote multipliers!" << endl;
        is >> voteMultipliers;
        ERROR_IF(!is, "Error reading vote multipliers of knn net file!");
        // the prediction methods index one multiplier per masked input value
        ERROR_IF(voteMultipliers.size() < mask.size(),
                 "Too few vote multipliers for voting node!");
      }
      else {
        // uniform vote multipliers for all other entries
        if (mask.size() > 0) {
          double term = 1.0 / static_cast<double>(mask.size());
          voteMultipliers = Vec<double>(mask.size(), term);
        }
        else {
          voteMultipliers.clear();
        }
      }
      convert2InternalCounting(mask);
      knnLevelIds[i].push_back(knnId);
      knnLevelTargetIds[i].push_back(targetId);
      knnLevelMultipliers[i].push_back(mul);
      knnLevelVoteMultipliers[i].push_back(voteMultipliers);
      knnLevelTargetDim[i].push_back(knnDim);
      levelMasks[i].push_back(mask);
#ifdef DEBUG_VERBOSE      
      cout << "read knn data: Id: " << knnId << " target id: " << targetId << " dim: " << knnDim << " mul: "
           << mul << " mask: " << mask << endl;
#endif
    }
  }
  ERROR_IF(!is, "Error after reading last section of knn net file!");
#ifdef DEBUG_VERBOSE      
  cout << "Ending KnnNet2::read!" << endl;
#endif
}

/** returns level to which node with this id belongs */
unsigned int
KnnNet2::nodeLevel(unsigned int nodeId, unsigned int& levelId) const {
  for (unsigned int i = 0; i < knnLevelIds.size(); ++i) {
    for (unsigned int j = 0; j < knnLevelIds[i].size(); ++j) {
      if (knnLevelIds[i][j] == static_cast<int>(nodeId)) { // node id found
	levelId = j; // id with respect to this level
	return i; // return level
      }
    }
  }
  return numLevels; // node not found
}

/** Returns the indices of all knn nodes for which nodeLevel reports the
    given level, in ascending order.

    @param level level to look for
    @return indices of the matching knn nodes */
Vec<unsigned int>
KnnNet2::findLevelNodes(unsigned int level) const {
  Vec<unsigned int> members;
  unsigned int positionInLevel; // required by nodeLevel, not used here
  for (unsigned int node = 0; node < knnNodes.size(); ++node) {
    if (nodeLevel(node, positionInLevel) != level) {
      continue;
    }
    members.push_back(node);
  }
  return members;
}

/** Optimizes the scaling of one knn node by forwarding to
    KnnNode2::optimizeScaling (simple Monte Carlo steps according to the
    original description).

    @param knnId        index of the knn node; an error is raised if it is
                        out of range
    @param numSteps     handed on to the node
    @param verboseLevel handed on to the node (hides the member of the same
                        name)
    @param stepWidth    handed on to the node
    @param numTrials    handed on to the node
    @return scaling of the node after the optimization */
Vec<double>
KnnNet2::optimizeScaling(unsigned int knnId,
			int numSteps,
			int verboseLevel,
			double stepWidth,
			unsigned int numTrials)
{
  ERROR_IF(knnId >= knnNodes.size(), "KnnNet2::optimizeScaling: too large knn node index!");
  KnnNode2& node = knnNodes[knnId];
  node.optimizeScaling(numSteps, verboseLevel, stepWidth, numTrials);
  return node.getScaling();
}

/** Fills one knn node with training data derived from the input vectors of
    its level and writes the new node data to a file.

    Steps:
      1. locate the level and the position within the level of node knnId;
      2. take the columns of levelData selected by the mask of that entry;
      3. reduce the rows with simpleRepresentativeLinkage (cutoff) and
         clusterRandomSubsets (simMax);
      4. set the remaining rows and their classes as data of the node,
         keeping its number of classes and its scaling;
      5. if knnThin > 0, thin the node data with knnThin;
      6. write the node data to writeDirOrig + original data file name.

    numSteps, stepWidth and numTrials are currently not used; they are kept
    for interface compatibility.

    @param levelData    input vectors of the level, one row per training case
    @param levelClasses class of each row of levelData (same size required)
    @param knnId        index of the knn node; must be part of a level
    @param numSteps     unused
    @param knnThin      if greater zero, argument for KnnNode2::thin
    @param verboseLevel progress messages are written to cout if greater zero
    @param stepWidth    unused
    @param numTrials    unused
    @param simMax       handed on to clusterRandomSubsets
    @param cutoff       handed on to simpleRepresentativeLinkage
    @param writeDirOrig directory receiving the data file of the node */
void
KnnNet2::bootstrapLevelNode(const Vec<Vec<double> >& levelData,
			   const Vec<unsigned int>& levelClasses,
			   unsigned int knnId,
			   int numSteps,
			   int knnThin,
			   int verboseLevel,
			   double stepWidth,
			   unsigned int numTrials,
			   unsigned int simMax,
			   double cutoff,
			   const string& writeDirOrig)
{
  ERROR_IF(knnId >= knnNodes.size(), "KnnNet2::bootstratLevelNode: too large node index!");
  // the row subset found by clustering is applied to both containers
  ERROR_IF(levelData.size() != levelClasses.size(),
           "KnnNet2::bootstrapLevelNode: data and class vectors differ in size!");
  if (verboseLevel > 0) {
    cout << "Starting boorstrapLevelNode for node " << knnId + 1 << endl;
  }
  // find level and position of the node within that level:
  unsigned int level = 0;
  unsigned int levelKnnId = 0; // what id has knnId in that level?
  bool found = false;
  ERROR_IF(numLevels != knnLevelIds.size(), 
           "Internal error in line 517!");
  for (unsigned int i = 0; i < numLevels; ++i) {
    if (verboseLevel > 0) {
      cout << "Searching level " << i + 1 << "  "<<  knnLevelIds[i];
    }
    for (unsigned int j = 0; j < knnLevelIds[i].size(); ++j) {
      if (knnLevelIds[i][j] == static_cast<int>(knnId)) {
        level = i;
        levelKnnId = j;
        found = true;
        break;
      }
    } 
    if (found) {
      break;
    }
  }
  ERROR_IF(!found, "Node not found!");
  if (verboseLevel > 0) {
    cout << "Found level: " << level + 1 << " " << levelKnnId + 1 << endl;
  }
  // determine input raw data:
  const Vec<unsigned int>& mask = levelMasks[level][levelKnnId];
  if (verboseLevel > 0) {
    cout << "used mask: " << externalCounting(mask) << endl;
    if (levelData.size() > 0) { // nothing to show for empty input
      cout << "first line of orig input vector: " 
           << levelData[0] << " class : " << levelClasses[0] << endl;
    }
  }
  Vec<Vec<double> > usedData = getColumns(levelData, mask);
  ERROR_IF(!isRectangle(usedData), "Internal error in line 555!");
  if (verboseLevel > 0) {
    cout << "Starting cluster algorithm!" << endl;
  }

  // row indices that remain after clustering
  Vec<unsigned int> subs = flatten(clusterRandomSubsets(simpleRepresentativeLinkage(usedData, cutoff),
                                                        simMax));
  if (verboseLevel > 0) {
    cout << "result of clustering: " << subs.size() << " entries." << endl;
  }
  usedData = getSubset(usedData, subs);
  Vec<unsigned int> usedClasses = getSubset(levelClasses, subs);
  ERROR_IF(usedData.size() != usedClasses.size(),
           "Internal error in line 554!");
  // set input data
  if (verboseLevel > 0) {
    cout << "Setting data: " << usedData.size() << " rows." << endl;
  }
  knnNodes[knnId].setData(usedData, usedClasses, 
                          knnNodes[knnId].getNumClasses(),
                          knnNodes[knnId].getScaling());

  if (knnThin > 0) {
    cout << "Thinning data! Starting node " << " " << knnId + 1 << " size "
         << knnNodes[knnId].size() << endl;
    knnNodes[knnId].thin(static_cast<unsigned int>(knnThin));
    cout << "Thinning data finished: node " << " " << knnId + 1 << " size "
         << knnNodes[knnId].size() << endl;
  }

  // save data of knn node:
  string writeDir = writeDirOrig;
  addSlash(writeDir);
  string outFileName = writeDir + knnOrigFileNames[knnId];
  if (verboseLevel > 0) {
    cout << "Writing new data of node " << knnId + 1 
         << " to file " << outFileName << endl;
  }
  ofstream outFile(outFileName.c_str());
  ERROR_IF(!outFile, "Error opening output file!");
  knnNodes[knnId].writeData(outFile);
  outFile.close();
}

/** optimize scaling of node using simple Monte Carlo steps */
void
KnnNet2::bootstrapLevel(const Vec<Vec<double> >& levelData,
		       const Vec<unsigned int>& levelClasses,
		       unsigned int level,
		       int numSteps,
		       int thin,
		       int verboseLevel,
		       double stepWidth,
		       unsigned int numTrials,
		       unsigned int simMax,
		       double cutoff,
		       const string& writeDir)
{
  ERROR_IF(level >= numLevels, "KnnNet2::bootstratLevel: too large level index!");
  cout << "Starting bootstrapLevel for level " << level + 1 << endl;
  Vec<unsigned int> idSet = findLevelNodes(level); // return all nodes of level
  for (unsigned int ii = 0; ii < idSet.size(); ++ii) { // loop over nodes of level
    if (knnNodes[idSet[ii]].size() <= KNNNET_DUMMY_SIZE) { // only optimize if dummy was given so far
      bootstrapLevelNode(levelData, levelClasses, idSet[ii], numSteps, 
			 thin, verboseLevel, stepWidth, numTrials,
			 simMax, cutoff, writeDir);
    }
    cout << "Estimated accuracy for node " << idSet[ii] + 1 << " " 
	 << knnOrigFileNames[idSet[ii]] << " : " 
	 << knnNodes[idSet[ii]].estimateAccuracy(1000) 
	 << " size: " <<  knnNodes[idSet[ii]].size() << endl;
  }
    // knnNodes[knnId].optimizeScaling(numSteps, verboseLevel, stepWidth, numTrials);
  cout << "Finished bootstrap level!" << endl;
}

/** Bootstraps the whole net level by level.

    The first level is bootstrapped with the raw data. For every further
    level k the raw data is first propagated through the k levels before it
    (predictClassProbLevel); the resulting vectors are the training input of
    level k.

    @param rawData    raw training vectors
    @param rawClasses class of each raw training vector (same size as rawData)
    @param writeDir   directory receiving the data files of the nodes
    All remaining parameters are handed on to bootstrapLevel unchanged;
    verboseLevel greater zero additionally switches on progress messages. */
void
KnnNet2::bootstrap(const Vec<Vec<double> >& rawData,
		  const Vec<unsigned int>& rawClasses,
		  int numSteps,
		  int thinK,
		  int verboseLevel,
		  double stepWidth,
		  unsigned int numTrials,
		  unsigned int simMax,
		  double cutoff,
		  const string& writeDir)
{
  PRECOND(rawData.size() == rawClasses.size());
  const bool verbose = (verboseLevel > 0);
  cout << "Starting bootstrap with " 
       << rawData.size() << " lines " << endl;
  // training vectors of the level currently worked on
  Vec<Vec<double> > levelData(rawData.size()); 

  // first level: trained directly on the raw data
  bootstrapLevel(rawData, rawClasses, 0, numSteps, thinK, verboseLevel, 
                 stepWidth, numTrials, simMax, cutoff, writeDir);
  if (verbose) {
    cout << "Finished generating input data for level " << 1 << endl;
  }

  // higher levels: trained on the output of the levels below
  for (unsigned int lev = 1; lev < numLevels; ++lev) {
    if (verbose) {
      cout << "Starting to generate input data for level " << lev + 1 << endl;
    }
    for (unsigned int row = 0; row < rawData.size(); ++row) {
      levelData[row] = predictClassProbLevel(rawData[row], lev);
    }
    if (verbose) {
      cout << "Finished generating input data for level " << lev + 1 << endl;
    }
    bootstrapLevel(levelData, rawClasses, lev, numSteps, thinK, verboseLevel, 
                   stepWidth, numTrials, simMax, cutoff, writeDir);
  }
  if (verbose) {
    cout << "Finished bootstrap!" << endl;
  }
}

/** Similarity score of two file names, case-sensitive.

    Both names are first reduced with fileNameBase (according to the
    original description this drops path and ending). The score is the
    number of position pairs (i, j) for which character i of the first name
    equals character j of the second name, so repeated characters are
    counted once per pair.

    @param s1 first file name
    @param s2 second file name
    @return number of matching character pairs */
int
KnnNet2::fileNameSimilarity(string s1, string s2)
{
  const string first = fileNameBase(s1);
  const string second = fileNameBase(s2);
  int matches = 0;
  for (string::size_type a = 0; a < first.size(); ++a) {
    const char wanted = first[a];
    for (string::size_type b = 0; b < second.size(); ++b) {
      matches += (wanted == second[b]) ? 1 : 0;
    }
  }
  return matches;
}

/** Greedily boosts the nodes of one level and returns their vote weights.
 *
 * All training records start with the same weight. In every round each node
 * of the level that has not been checked off yet is applied to levelData
 * (restricted to the columns of the node's mask), and the node with the
 * smallest weighted misclassification rate epsilon is selected and gets
 * alpha = log((1 - epsilon) / epsilon):
 *  - alpha > 0: the weight of each record is multiplied by exp(+alpha) if
 *    the node misclassified it and by exp(-alpha) otherwise, and alpha is
 *    stored as the weight of the node;
 *  - otherwise the node is cleared, its weight is set to zero, and its data
 *    file (writeDir + original file name), if it exists, is renamed to the
 *    same name with ".bak" appended.
 * The selected node, and every other node whose original file name has a
 * fileNameSimilarity() score greater than 1 with it, is excluded from later
 * rounds. Rounds stop when all nodes are checked off or epsilon reaches 1.
 * If a node classifies every record correctly, the method returns at once
 * with weight 1 for that node and 0 for all others.
 *
 * While knnForceId >= 0 and the forced node has not been selected yet, only
 * the node with that id is considered.
 *
 * Nodes whose size is not larger than KNNNET_DUMMY_SIZE are trained first
 * with bootstrapLevelNode(); numSteps, thin, stepWidth, numTrials, simMax
 * and cutoff are only forwarded to that call.
 *
 * @param levelData    input vectors of this level, one per training record
 * @param levelClasses true class of each training record
 * @param level        zero-based level index, must be smaller than numLevels
 * @param verboseLevel forwarded to bootstrapLevelNode(); values > 0 also
 *                     report the renaming of data files
 * @param writeDirOrig directory containing the node data files
 * @return one weight per node of the level, indexed by the id that
 *         nodeLevel() reports for the node (presumably its position within
 *         the level); an empty vector if the level has no nodes
 */
Vec<double>
KnnNet2::boostLevel(const Vec<Vec<double> >& levelData,
		    const Vec<unsigned int>& levelClasses,
		    unsigned int level,
		    int numSteps,
		    int thin,
		    int verboseLevel,
		    double stepWidth,
		    unsigned int numTrials,
		    unsigned int simMax,
		    double cutoff,
		    const string& writeDirOrig)
{
  ERROR_IF(level >= numLevels, "KnnNet2::boostLevel: too large level index!");
  cout << "Starting boostLevel for level " << level + 1 << endl;
  string writeDir = writeDirOrig;
  addSlash(writeDir);
  // every training record starts with the same weight
  Vec<double> dataWeights(levelData.size(), 1.0 / static_cast<double>(levelData.size()));
  Vec<unsigned int> idSet = findLevelNodes(level); // ids of all nodes of this level
  cout << "The folllowing node ids where found on this level: " << idSet << endl;

  if (idSet.size() == 0) {
    return Vec<double>();
  }
  Vec<double> alphaVec(idSet.size(), 0.0); // resulting weight per node
  Vec<string> nameVec(alphaVec.size());    // file name per selected node (for reporting)
  Vec<string> goodNames;                   // names of nodes that received a positive weight
  Vec<double> goodAlpha;                   // the corresponding weights
  double epsilon = 0.0;
  Vec<unsigned int> usedAlready; // ids of nodes that must not be selected again
  bool forcePicked = false;      // true once the node knnForceId has been selected
  while ((epsilon < 1.0) && (usedAlready.size() < idSet.size())) {
    unsigned int iiBest = 0; // position in idSet of the best node of this round
    double bestEpsilon = 0.0;
    bool first = true;       // true until the first candidate has been evaluated
    unsigned int levelIdBest = 0;
    Vec<int> bestMisClass;
    unsigned int bestMisClassNum = 0;
    for (unsigned int ii = 0; ii < idSet.size(); ++ii) { // loop over nodes of level
      unsigned int levelId = 0;
      nodeLevel(idSet[ii], levelId);
      if (findFirstIndex(usedAlready, idSet[ii]) < usedAlready.size()) {
        continue; // already checked off
      }
      if ((!forcePicked) && (knnForceId >= 0) && (static_cast<int>(idSet[ii]) != knnForceId)) {
        continue; // the forced node has to be selected first
      }
      const Vec<unsigned int>& mask = levelMasks[level][levelId]; // columns used by this node
      if (knnNodes[idSet[ii]].size() <= KNNNET_DUMMY_SIZE) { // only optimize if dummy was given so far
        bootstrapLevelNode(levelData, levelClasses, idSet[ii], numSteps,
                           thin, verboseLevel, stepWidth, numTrials, simMax, cutoff, writeDir);
      }
      cout << "Estimated accuracy for node " << idSet[ii] + 1 << " "
           << knnOrigFileNames[idSet[ii]] << " : "
           << knnNodes[idSet[ii]].estimateAccuracy(1000)
           << " size: " <<  knnNodes[idSet[ii]].size() << " level id: " << levelId << " mask: " << mask;
      // apply node to the training data: +1 marks a misclassified record, -1 a correct one
      unsigned int cl;
      Vec<int> misClassified(levelData.size());
      unsigned int misClassNum = 0;
      Vec<Vec<double> > usedData = getColumns(levelData, mask);
      bool allPerfect = true;
      for (unsigned int j = 0; j < levelData.size(); ++j) {
        cl = findMaxIndex(knnNodes[idSet[ii]].predictClassProb(usedData[j]));
        if (cl != levelClasses[j]) {
          misClassified[j] = 1; // used for boosting bad cases
          allPerfect = false;
          ++misClassNum;
        }
        else {
          misClassified[j] = -1;
          // misClassified[j] = 0;// weaker decay!!!
        }
      }
      // a node without any misclassification makes all other nodes superfluous
      if (allPerfect) {
        for (unsigned int i = 0; i < alphaVec.size(); ++i) {
          alphaVec[i] = 0.0;
        }
        alphaVec[levelId] = 1.0;
        // record the name first, otherwise an empty string would be reported
        nameVec[levelId] = knnOrigFileNames[idSet[ii]];
        goodAlpha.push_back(1.0);
        goodNames.push_back(nameVec[levelId]);
        cout << "Result of boostLevelNode: " << misClassNum
             << " nodes misclassified, perfect classifier found!"
             << endl;
        // TODO : avoid code duplication with end of method
        cout << "Weights of level (no normalization): " << alphaVec;
        cout << "Names of level: " << nameVec;
        cout << "Good weights of level (no normalization): " << goodAlpha;
        cout << "Good names of level: " << goodNames;
        return alphaVec; // only this classifier needed!
      }
      // weighted misclassification rate of this node
      double epsilonTmp = 0.0;
      double norm = 0.0;
      for (unsigned int j = 0; j < misClassified.size(); ++j) {
        norm += dataWeights[j];
        if (misClassified[j] == 1) {
          epsilonTmp += dataWeights[j];
        }
      }
      ASSERT(norm > 0.0);
      epsilonTmp /= norm;
      if (first  || (epsilonTmp < bestEpsilon)) {
        bestEpsilon = epsilonTmp;
        iiBest = ii;
        bestMisClass = misClassified;
        levelIdBest = levelId;
        first = false;
        bestMisClassNum = misClassNum;
        if ((knnForceId >= 0) && (static_cast<int>(idSet[ii]) == knnForceId)) {
          forcePicked = true;
        }
      }
    }
    // every candidate was skipped, e.g. because knnForceId is no node of this level
    ERROR_IF(first,
             "KnnNet2::boostLevel: no node of this level could be evaluated (is knnForceId a node of this level?)");
    usedAlready.push_back(idSet[iiBest]);
    // also check off all nodes with similar file names:
    for (unsigned int i3 = 0; i3 < idSet.size(); ++i3) {
      if ((idSet[i3] == idSet[iiBest]) || (findFirstIndex(usedAlready, idSet[i3]) < usedAlready.size())) {
        continue;
      }
      if (fileNameSimilarity(knnOrigFileNames[idSet[iiBest]], knnOrigFileNames[idSet[i3]]) > 1) {
        cout << "Also dissallowing " << knnOrigFileNames[idSet[i3]] << " because too similar to "
             << knnOrigFileNames[idSet[iiBest]] << endl;
        usedAlready.push_back(idSet[i3]);
      }
    }
    epsilon = bestEpsilon;
    // a non-perfect node has at least one misclassified record, so its rate must be positive
    ERROR_IF(epsilon <= 0.0,
             "KnnNet2::boostLevel: internal error, weighted error rate of best node is not positive!");
    if (epsilon >= 1.0) {
      cout << "Epsilon greater one: " << epsilon << endl;
      break;
    }
    double alpha = log((1.0 - epsilon) / epsilon);
    nameVec[levelIdBest] = knnOrigFileNames[idSet[iiBest]];
    cout << "Best node found is: " << iiBest + 1 << " " << idSet[iiBest] + 1 << " "
         << knnOrigFileNames[idSet[iiBest]] << " " << levelIdBest + 1 << " epsilon: "
         << bestEpsilon << " alpha: " << alpha << " with " << bestMisClassNum << " misclassified records." << endl;
    if (alpha > 0.0) {
      // update weight of training data
      for (unsigned int j = 0; j < bestMisClass.size(); ++j) {
        dataWeights[j] *= exp(alpha * bestMisClass[j]); // downregulate good cases ("-1"), boost bad cases ("1")
      }
      // the weight of the node is alpha
      alphaVec[levelIdBest] = alpha;
      goodAlpha.push_back(alpha);
      goodNames.push_back(nameVec[levelIdBest]);
      cout << "Setting alpha: " << alpha << " " << level << " " << levelIdBest << " "
           <<  knnLevelMultipliers[level][levelIdBest] << endl;
      // probabilityNormalize(dataWeights); // normalize such that sum is one
    }
    else {
      knnNodes[idSet[iiBest]].clear(); // clear bad classifier
      cout << "Setting alpha to zero: " << alpha << " " << level << " " << levelIdBest << " "
           <<  knnLevelMultipliers[level][levelIdBest] << endl;
      alphaVec[levelIdBest] = 0.0;
      string outFileName = writeDir + knnOrigFileNames[idSet[iiBest]];
      // check if file exists:
      ifstream checkFile(outFileName.c_str());
      if (!checkFile) {
        checkFile.close(); // do nothing
      }
      else {
        checkFile.close();
        string movedName = outFileName + ".bak";
        if (verboseLevel > 0) {
          cout << "Clearing node and renamming data of node "
               << idSet[iiBest] + 1 << " with size " << knnNodes[idSet[iiBest]].size()
               << " to file " << outFileName << " " << movedName << endl;
        }
        // moving the file aside is best effort, but a failure should not go unnoticed
        if (rename(outFileName.c_str(), movedName.c_str()) != 0) {
          cout << "Warning: could not rename " << outFileName << " to " << movedName << endl;
        }
      }
    }
    cout << "Result one boost iteration " << bestMisClassNum << " nodes misclassified, alpha: "
         << alpha << " epsilon: " << epsilon << " levelId: " << levelIdBest + 1 << endl;

  }
  cout << "Weights of level (no normalization): " << alphaVec;
  cout << "Names of level: " << nameVec;
  cout << "Good weights of level (no normalization): " << goodAlpha;
  cout << "Good names of level: " << goodNames;
  cout << "Finished bootstrap level!" << endl;
  return alphaVec;
}


/** Boosts the first level and stores the resulting weights for the second level.
 *
 * Runs boostLevel() for level 0 on the raw training data and assigns the
 * returned weight vector to knnLevelVoteMultipliers[1][0]. Only networks
 * with exactly two levels, the second of which has a single entry in
 * knnLevelVoteMultipliers, are supported; this is verified before
 * boostLevel() is started, because that call trains and may clear nodes
 * and rename their data files.
 *
 * The progress messages still use the word "bootstrap"; they are kept
 * unchanged.
 *
 * @param rawData      training vectors, one per record
 * @param rawClasses   class of each record, same length as rawData
 * @param verboseLevel values > 0 enable progress messages; also forwarded
 * @param writeDir     forwarded to boostLevel()
 *
 * numSteps, thinK, stepWidth, numTrials, simMax and cutoff are forwarded
 * to boostLevel() unchanged.
 */
void
KnnNet2::boost(const Vec<Vec<double> >& rawData,
	       const Vec<unsigned int>& rawClasses,
	       int numSteps,
	       int thinK,
	       int verboseLevel,
	       double stepWidth,
	       unsigned int numTrials,
	       unsigned int simMax,
	       double cutoff,
	       const string& writeDir)
{
  PRECOND(rawData.size() == rawClasses.size());
  // reject unsupported network layouts before any node is modified
  ERROR_IF(numLevels != 2, "Currently only 2 levels are supported for boosting!");
  ERROR_IF(knnLevelVoteMultipliers[1].size() != 1,
           "Currently only 2 levels with sizes n and 1 are supported for boosting!");
  cout << "Starting bootstrap with "
       << rawData.size() << " lines " << endl;
  Vec<double> alphaVec = boostLevel(rawData, rawClasses, 0, numSteps, thinK, verboseLevel,
                                    stepWidth, numTrials, simMax, cutoff, writeDir);
  if (verboseLevel > 0) {
    cout << "Finished generating input data for level " << 1 << endl;
  }
  // the weights of the first-level nodes become the vote multipliers of the single second-level entry
  knnLevelVoteMultipliers[1][0] = alphaVec;
  if (verboseLevel > 0) {
    cout << "Finished bootstrap!" << endl;
  }
}


/** Calls optimizeScaling() on every node of the network, in index order.
 *
 * All four arguments are handed to each node unchanged. With
 * verboseLevel > 0 one line announcing the node (one-based) and the
 * parameters is printed before the node is processed.
 */
void
KnnNet2::optimizeScaling(int numSteps,
			int verboseLevel,
			double stepWidth,
			unsigned int numTrials)
{
  const bool announce = (verboseLevel > 0);
  unsigned int nodeIdx = 0;
  while (nodeIdx < knnNodes.size()) {
    if (announce) {
      cout << "Optimizing scaling of node: " << nodeIdx + 1 << "  numSteps: "
           << numSteps << " stepWidth: " << stepWidth
           << " numTrials: " << numTrials << endl;
    }
    knnNodes[nodeIdx].optimizeScaling(numSteps, verboseLevel, stepWidth, numTrials);
    ++nodeIdx;
  }
}

/** Thins every node of the network; thinK is passed unchanged to thin() of each node. */
void
KnnNet2::thin(unsigned int thinK)
{
  unsigned int nodeIdx = 0;
  while (nodeIdx < knnNodes.size()) {
    knnNodes[nodeIdx].thin(thinK);
    ++nodeIdx;
  }
}

/** Collects one component of the last prediction of every node.
 *
 * @param dim index into the vector returned by getLastPrediction() of each
 *            node; it is not range-checked here
 * @return    vector with one entry per node, in node order
 */
Vec<double>
KnnNet2::getLastNodePredictions(unsigned int dim) const 
{
  Vec<double> perNode(knnNodes.size());
  unsigned int nodeIdx = 0;
  while (nodeIdx < knnNodes.size()) {
    perNode[nodeIdx] = knnNodes[nodeIdx].getLastPrediction()[dim];
    ++nodeIdx;
  }
  return perNode;
}
