// --*- C++ -*------x---------------------------------------------------------
// $Id: myuniq.cc,v 1.1.1.1 2006/07/03 14:43:20 bindewae Exp $
//
// Program:         - 
//
// Author:          Eckart Bindewald
//
// Project name:    -
//
// Date:            $Date: 2006/07/03 14:43:20 $
//
// Description:     
// 
// -----------------x-------------------x-------------------x-----------------

#define __STL_NO_DRAND48

#include <iostream>
#include <fstream>
#include <string>
#include <Vec.h>
#include <debug.h>
#include <GetArg.h>
#include <FileName.h>
#include <vectornumerics.h>
#include <clusterAlgorithms.h>
#include <Random.h>
#include <algorithm>

/** Writes the command line usage summary to os.
 *
 *  Option spellings mirror the getArg() calls in main(): a key such as
 *  "-mask" is given as "--mask" on the command line, "i" as "-i".
 *
 *  @param os stream that receives the help text
 */
void
helpOutput(ostream& os)
{
  os << "myuniq -i inputfile -d cutoff [options]" << endl
     << "-i filename     : input file (standard input if omitted)" << endl
     << "-d cutoff       : rows closer than this cutoff count as similar" << endl
     << "--commands filename : read additional options from command file" << endl
     << "--log filename  : append the used parameters to this log file" << endl
     << "--mask id1 id2 ...   : input column ids" << endl
     << "--mask-mode 1|2 : without --mask: 1: use all columns, 2: all but last column" << endl
     << "--max value  : maximum size of cluster" << endl
     << "--min value  : minimum size of cluster" << endl
     << "--root filename : root directory" << endl
     << "--sim n         : maximally this many similar rows in a row" << endl
     << "--unique 0|1    : unique data only" << endl
     << "--verbose level : verbose level"  << endl;  
}

/** Echoes the argument vector the program was started with.
 *
 *  Every argument is followed by a single blank; the line is terminated
 *  with endl.
 *
 *  @param os   destination stream
 *  @param argc number of entries in argv
 *  @param argv argument strings
 */
void
parameterOutput(ostream& os, int argc, char** argv)
{
  int remaining = argc;
  char** current = argv;
  while (remaining > 0) {
    os << *current << " ";
    ++current;
    --remaining;
  }
  os << endl;
}

/** Computes all pairwise distances between the given data vectors.
 *
 *  @param dataVecs data vectors to compare
 *  @return square matrix with one row and column per data vector; entry
 *          [i][j] holds euclidianDistance of vectors i and j, the diagonal
 *          stays 0.0 and the matrix is filled symmetrically
 */
Vec<Vec<double> >
generateDistanceMatrix(const Vec<Vec<double> >& dataVecs) 
{
  const unsigned int count = dataVecs.size();
  Vec<Vec<double> > matrix(count, Vec<double>(count, 0.0));
  for (unsigned int low = 0; low < count; ++low) {
    for (unsigned int high = low + 1; high < count; ++high) {
      // each unordered pair is evaluated once and mirrored
      const double dist = euclidianDistance(dataVecs[high], dataVecs[low]);
      matrix[high][low] = dist;
      matrix[low][high] = dist;
    }
  }
  return matrix;
}

/** Groups the data vectors into clusters of row indices.
 *
 *  @param dataVecs         data vectors to cluster
 *  @param cutoff           cutoff handed to simpleRepresentativeLinkage
 *                          (only used by algorithm 1)
 *  @param clusterAlgorithm 0: trivial clustering, every vector forms its own
 *                          one-element cluster; 1: simpleRepresentativeLinkage;
 *                          any other value is reported through ERROR
 *  @return one index list per cluster
 */
Vec<Vec<unsigned int> >
clusterVectors(const Vec<Vec<double> >& dataVecs,
	       double cutoff,
	       int clusterAlgorithm)
{
  Vec<Vec<unsigned int> > result;
  switch (clusterAlgorithm) {
  case 0:  // trivial clustering
    result = Vec<Vec<unsigned int> >(dataVecs.size());
    for (unsigned int i = 0; i < dataVecs.size(); ++i) {
      result[i] = Vec<unsigned int>(1, i);
    }
    break;
  case 1:
    result = simpleRepresentativeLinkage(dataVecs, cutoff); 
    break;
  default:
    ERROR("Unknown cluster algorithm!");
  }
  return result;
}

/** Reads the stream line by line and splits every line into tokens.
 *
 *  Lines that yield no token (empty or whitespace only) are dropped.
 *
 *  @param is input stream, consumed until it reports failure
 *  @return table with one row of words per non-empty input line
 */
Vec<Vec<string> >
readStringTable(istream& is)
{
  Vec<Vec<string> > table;
  for (; is; ) {
    string currentLine = getLine(is);
    vector<string> tokens = getTokens(currentLine);
    if (tokens.empty()) {
      continue; // nothing but whitespace on this line
    }
    table.push_back(tokens);
  }
  return table;
}

/** Reads a table of words in the requested input format.
 *
 *  @param is         input stream
 *  @param dataFormat 0: read through operator>> of the table type,
 *                    1: simple list, one row per line (readStringTable);
 *                    any other value is reported through ERROR
 *  @return the table that was read
 */
Vec<Vec<string> >
readStringData(istream& is, int dataFormat)
{
  Vec<Vec<string> > table;
  if (dataFormat == 0) {
    is >> table;
  }
  else if (dataFormat == 1) {
    table = readStringTable(is);
  }
  else {
    ERROR("Unknown input data format!");
  }
  return table;
}

/** Converts the masked columns of one row of words to numbers.
 *
 *  Terminates the program with exit(1) if the row has fewer columns than
 *  the mask requires. The diagnostic goes to cerr because cout carries the
 *  data output of this program.
 *
 *  @param dataWords words of one input row
 *  @param mask      zero-based column indices to convert; must not be empty
 *  @return one value per mask entry, in mask order
 */
Vec<double>
convertStringsToDoubleData(const Vec<string>& dataWords,
			   const Vec<unsigned int>& mask)
{
  PRECOND(mask.size() > 0);
  Vec<double> result(mask.size());
  for (unsigned int i = 0; i < mask.size(); ++i) {
    if(mask[i] >= dataWords.size()) {
      cerr << "Insufficient number of columns in data row!" << endl;
      cerr << dataWords << endl << mask << endl;
      exit(1);
    }
    result[i] = stod(dataWords[mask[i]]);
  }
  return result;
}

/** Converts a table of words to a table of numbers.
 *
 *  The first row defines the expected number of columns; rows with a
 *  different number of columns are skipped.
 *
 *  @param dataWords table of words; must have a non-empty first row
 *  @param mask      zero-based column indices to convert; if empty, all
 *                   columns of the first row are used
 *  @return one numeric row per accepted input row
 */
Vec<Vec<double> >
convertStringsToDoubleData(const Vec<Vec<string> >& dataWords,
			   Vec<unsigned int> mask)
{
  ERROR_IF((dataWords.size() == 0) || (dataWords[0].size() == 0),
	   "Could not find suitable first data line.");
  const unsigned int width = dataWords[0].size();
  // first pass: number of rows that match the width of the first row
  unsigned int accepted = 0;
  for (unsigned int row = 0; row < dataWords.size(); ++row) {
    if (dataWords[row].size() == width) {
      ++accepted;
    }
  }
  Vec<Vec<double> > numbers(accepted);
  if (mask.size() == 0) {
    mask = generateStair(width);
  }
  // second pass: convert the matching rows
  unsigned int target = 0;
  for (unsigned int row = 0; row < dataWords.size(); ++row) {
    if (dataWords[row].size() == width) {
      numbers[target] = convertStringsToDoubleData(dataWords[row], mask);
      ++target;
    }
  }
  return numbers;
}

/** Maps the class label of one row to an integer class id.
 *
 *  The id is the position of the label in wordsSoFar; a label that has not
 *  been seen yet is appended and receives the next free id.
 *
 *  @param dataWords  words of one input row
 *  @param dataCol    zero-based column that holds the class label
 *  @param wordsSoFar labels encountered so far; may grow by one entry
 *  @return class id of the label of this row
 */
int
convertStringsToClassData(const Vec<string>& dataWords,
			  unsigned int dataCol,
			  Vec<string>& wordsSoFar)
{
  ERROR_IF(dataCol >= dataWords.size(), "Undefined class column data!");

  const string& label = dataWords[dataCol];
  unsigned int position = 0;
  while ((position < wordsSoFar.size()) && (wordsSoFar[position] != label)) {
    ++position;
  }
  if (position == wordsSoFar.size()) {
    wordsSoFar.push_back(label); // first occurrence of this label
  }
  return static_cast<int>(position);
}

/** Converts the class column of a table of words to integer class ids.
 *
 *  The first row defines the expected number of columns; rows with a
 *  different number of columns are skipped. Ids are assigned in order of
 *  first appearance of each label.
 *
 *  @param dataWords table of words; must have a non-empty first row
 *  @param classCol  zero-based class column; 0 selects the last column of
 *                   the first row
 *  @return one class id per accepted row
 */
Vec<int>
convertStringsToClassData(const Vec<Vec<string> >& dataWords,
			  unsigned int classCol)
{
  ERROR_IF((dataWords.size() == 0) || (dataWords[0].size() == 0),
	   "Could not find suitable first data line.");
  if (classCol == 0) {
    // was "classCol == ...", a comparison without effect; the default
    // of using the last column was therefore never applied
    classCol = dataWords[0].size()-1;
  }
  unsigned int numCol = dataWords[0].size();
  unsigned int goodCount = 1; // the first row always matches itself
  for (unsigned int i = 1; i < dataWords.size(); ++i) {
    if (dataWords[i].size() == numCol) {
      ++goodCount;
    }
  }
  Vec<int> result(goodCount);
  Vec<string> wordsSoFar;
  unsigned int pc = 0;
  for (unsigned int i = 0; i < dataWords.size(); ++i) {
    if (dataWords[i].size() != numCol) {
      continue;
    }
    result[pc++] = convertStringsToClassData(dataWords[i], classCol, wordsSoFar);
  }
  return result;
}

/** Writes the rows belonging to each cluster, one row per line.
 *
 *  Clusters with fewer than clusterSizeMin members are skipped. Of every
 *  remaining cluster at most clusterSizeMax members are written (no limit
 *  if clusterSizeMax <= 0), in shuffled order. Words are separated by a
 *  blank.
 *
 *  @param os             destination stream
 *  @param dataWords      table of words, indexed by the ids in clusters
 *  @param clusters       row ids of each cluster
 *  @param clusterSizeMax maximum number of rows written per cluster
 *  @param clusterSizeMin minimum size a cluster needs to be written
 */
void
writeStringTable(ostream& os, 
		 const Vec<Vec<string> >& dataWords,
		 const Vec<Vec<unsigned int> >& clusters,
		 int clusterSizeMax,
		 int clusterSizeMin)
{
  Random& rnd = Random::getInstance();
  for (unsigned int i = 0; i < clusters.size(); ++i) {
    // the size of this cluster counts, not the number of clusters
    if (static_cast<int>(clusters[i].size()) < clusterSizeMin) {
      continue;
    }
    unsigned int maxOut = clusters[i].size();
    if (clusterSizeMax > 0) {
      maxOut = minimum(maxOut, static_cast<unsigned int>(clusterSizeMax));
    }
    // NOTE(review): only the first maxOut members are shuffled, so a
    // truncated cluster always emits its leading members - confirm that
    // this is intended rather than a random subset of the whole cluster.
    Vec<unsigned int> rids = generateStair(maxOut);
    random_shuffle(rids.begin(), rids.end(), rnd);
    for (unsigned int jj = 0; jj < maxOut; ++jj) {
      unsigned int j = rids[jj]; // random index
      unsigned int id = clusters[i][j];
      for (unsigned int k = 0; k < dataWords[id].size(); ++k) {
	os << dataWords[id][k] << " ";
      }
      os << endl;
    }
  }
}


void
writeStringTable(ostream& os, 
		 const Vec<Vec<string> >& dataWords,
		 const Vec<unsigned int>& clusters)
{
  for (unsigned int i = 0; i < clusters.size(); ++i) {
    unsigned int id = clusters[i];
    for (unsigned int k = 0; k < dataWords[id].size(); ++k) {
      os << dataWords[id][k] << " ";
    }
    os << endl;
  }
}


/** Writes clustered rows in the requested output format.
 *
 *  @param os             destination stream
 *  @param dataWords      table of words
 *  @param clusters       row ids of each cluster
 *  @param outputFormat   1: simple list (same as input format "1"); any
 *                        other value is reported through ERROR
 *  @param clusterSizeMax passed on to writeStringTable
 *  @param clusterSizeMin passed on to writeStringTable
 */
void
writeData(ostream& os, 
	  const Vec<Vec<string> >& dataWords,
	  const Vec<Vec<unsigned int> >& clusters,
	  int outputFormat, 
	  int clusterSizeMax,
	  int clusterSizeMin)
{
  if (outputFormat == 1) {
    writeStringTable(os, dataWords, clusters, clusterSizeMax,
		     clusterSizeMin);
  }
  else {
    ERROR("Unknown output format!");
  }
}

/** Writes the selected rows in the requested output format.
 *
 *  @param os           destination stream
 *  @param dataWords    table of words
 *  @param ids          ids of the rows to write
 *  @param outputFormat 1: simple list (same as input format "1"); any
 *                      other value is reported through ERROR
 */
void
writeData(ostream& os, 
	  const Vec<Vec<string> >& dataWords,
	  const Vec<unsigned int>& ids,
	  int outputFormat)
{
  if (outputFormat == 1) {
    writeStringTable(os, dataWords, ids);
  }
  else {
    ERROR("Unknown output format!");
  }
}

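/** Returns true if two data vectors are more similar (abs norm) than cutoff.
    The definition below is commented out; processFile still calls
    isDataVecSimilar, so the active definition presumably comes from one of
    the included headers. */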
/*
bool
isDataVecSimilar(const Vec<double>& dataVec,
		 const Vec<double>& lastVec, 
		 const Vec<unsigned int>& mask, 
		 double cutoff)
{
  if (dataVec.size() != lastVec.size()) {
    return false;
  }
  double sum = 0.0;
  for (unsigned int i = 0; i < mask.size(); ++i) {
    sum += fabs(dataVec[mask[i]]-lastVec[mask[i]]);
    if (sum > cutoff) {
      return false;
    }
  }
  return true;
}
*/

void
processFile(istream& is, 
	    Vec<unsigned int> mask,
	    unsigned int maskMode,
	    unsigned int simMax,
	    double cutoff) 
{
  Vec<double> dataVec, lastVec;
  string line;
  Vec<string> lastLines;
  Vec<unsigned int> ranIndices;
  while (is) {
    line = getLine(is);
    Vec<string> words = getTokens(line);
    if (words.size() == 0) {
      continue;
    }
    if (mask.size() == 0) {
      if (maskMode == 1) {
	mask = generateStair(words.size());
      }
      else if (maskMode == 2) { 
	mask = generateStair(words.size()-1);
      }
    }
    dataVec = convertStringsToDoubleData(words, mask);
    //     cout << "Mask : " << mask << endl << "data: " << dataVec << endl
    // 	 << " last vec: " << lastVec << endl;
    if ((lastLines.size() != 0) && (!isDataVecSimilar(dataVec, lastVec, mask, cutoff))) {
      // lastLines.push_back(line);
      unsigned int mini = simMax;
      if (lastLines.size() < simMax) {
	mini = lastLines.size();
      }
      // output of random sample
      ranIndices = generateRandomIndexSubset(mini, lastLines.size(), 0);
      for (unsigned int j = 0; j < ranIndices.size(); ++j) {
	cout << lastLines[ranIndices[j]] << endl;
      }
      lastLines.clear();
    }
    lastLines.push_back(line);
    lastVec = dataVec;
  }
  // if buffer still full at end of file:
  if (lastLines.size() > 0) {
    unsigned int mini = simMax;
    if (lastLines.size() < simMax) {
      mini = lastLines.size();
    }
    // output of random sample
    ranIndices = generateRandomIndexSubset(mini, lastLines.size(), 0);
    for (unsigned int j = 0; j < ranIndices.size(); ++j) {
      cout << lastLines[ranIndices[j]] << endl;
    }
    lastLines.clear();
  }
}

/** Program entry point.
 *
 *  Parses the options (optionally merged with a command file), appends the
 *  used parameters to the log file if one is given, and runs processFile on
 *  the input file or, if no input file is given, on standard input.
 *  For options read from both sources the command line value replaces the
 *  command file value, with the exception noted at "-log" below.
 */
int
main(int argc, char ** argv)
{
  bool helpMode = false; // explicit default in case getArg leaves it untouched
  int argcFile = 0;
  int clusterAlgorithm = 1; // 0: do nothing, 1: simpleRepresentative
  int clusterSizeMax = -1; // if > 0: restrict maximum size of output cluster
  int clusterSizeMin = 1; // minimum size of output cluster
  // int inputFormat = 1; // 0: native format, 1: simple list
  int kk = 0; // k for k nearest neighbors
  int knn = -1; // run knn algorithm on which data point
  // int outCountMode = 1;
  // int outputFormat = 1; // 0: native format, 1: simple list
  int maskMode = 1;
  int uniqueMode = 0;
  char ** argvFile = 0;
  unsigned int classCol = 99999; // large number, user has to specify
  unsigned int simMax = 1; // maximally this many similar rows in a row
  unsigned int verboseLevel = 0;
  double cutoff = 0.1;
  string commandFileName;
  string inputFileName;
  string logFileName; //  = "mainprogramtemplate.log";
  string rootDir = ".";
  Vec<unsigned int> mask;
  getArg("-help", helpMode, argc, argv);

  if ((argc < 2) || helpMode)  {
    helpOutput(cout);
    exit(0);
  }

  getArg("-root", rootDir, argc, argv, rootDir);
  addSlash(rootDir);

  getArg("-commands", commandFileName, argc, argv, commandFileName);
  addPathIfRelative(commandFileName, rootDir);

  if (commandFileName.size() > 0) {
    ifstream commandFile(commandFileName.c_str());
    if (!commandFile) {
      if (isPresent("-commands", argc, argv)) {
	ERROR_IF(!commandFile, "Error opening command file.");
      }
      else {
	cerr << "Warning: Could not find command file: " + commandFileName 
	     << endl;
      }
    }
    else {
      argvFile = streamToCommands(commandFile, argcFile, 
				  string("mainprogramtemplate"));
    }
    commandFile.close();
  }

  getArg("a", clusterAlgorithm, argcFile, argvFile, clusterAlgorithm);
  getArg("a", clusterAlgorithm, argc, argv, clusterAlgorithm);
  getArg("-class-col", classCol, argcFile, argvFile, classCol); // defines used class columns
  getArg("-class-col", classCol, argc, argv, classCol);
  --classCol; // internal counting starts at zero, external at one
  getArg("d", cutoff, argcFile, argvFile, cutoff);
  getArg("d", cutoff, argc, argv, cutoff);
  getArg("i", inputFileName, argc, argv, inputFileName);
  getArg("k", kk, argcFile, argvFile, kk);
  getArg("k", kk, argc, argv, kk);
  getArg("-knn", knn, argcFile, argvFile, knn);
  getArg("-knn", knn, argc, argv, knn);
  --knn;
  // NOTE(review): unlike all other options, "-log" is read from the command
  // line first and from the command file second - confirm that this
  // reversed order is intended.
  getArg("-log", logFileName, argc, argv, logFileName);
  getArg("-log", logFileName, argcFile, argvFile, logFileName);
  addPathIfRelative(logFileName, rootDir);
  getArg("-mask", mask, argcFile, argvFile); // defines used data columns
  getArg("-mask", mask, argc, argv);
  convert2InternalCounting(mask); // internal counting starts at zero
  getArg("-mask-mode", maskMode, argcFile, argvFile, maskMode); // defines used data columns
  getArg("-mask-mode", maskMode, argc, argv, maskMode);
  getArg("-max", clusterSizeMax, argcFile, argvFile, clusterSizeMax);
  getArg("-max", clusterSizeMax, argc, argv, clusterSizeMax);
  getArg("-min", clusterSizeMin, argcFile, argvFile, clusterSizeMin);
  // second lookup used the command file again, so "--min" given on the
  // command line was never read
  getArg("-min", clusterSizeMin, argc, argv, clusterSizeMin);
  getArg("-sim", simMax, argcFile, argvFile, simMax);
  getArg("-sim", simMax, argc, argv, simMax);
  getArg("-unique", uniqueMode, argcFile, argvFile, uniqueMode);
  getArg("-unique", uniqueMode, argc, argv, uniqueMode);
  getArg("-verbose", verboseLevel, argcFile, argvFile, verboseLevel);
  getArg("-verbose", verboseLevel, argc, argv, verboseLevel);

  if (logFileName.size() > 0) {
    ofstream logFile(logFileName.c_str(), ios::app);
    parameterOutput(logFile, argc, argv);
    if (argcFile > 1) {
      logFile << "Parameters from command file: ";
      parameterOutput(logFile, argcFile, argvFile);
    }
    logFile.close();
  }
  
  /***************** MAIN PROGRAM *****************************/

  if (inputFileName.size() > 0) {
    ifstream inputFile(inputFileName.c_str());
    ERROR_IF(!inputFile, "Error opening input file!");
    processFile(inputFile, mask, maskMode, simMax, cutoff);
    inputFile.close();
  }
  else {
    processFile(cin, mask, maskMode, simMax, cutoff);
  }

  return 0;
}
