package MatcherSpVar;

import java.nio.file.*;
import java.nio.charset.*;
import static java.util.stream.Collectors.*;

import java.io.*;
import java.util.*;
import java.util.regex.Pattern;
import java.util.regex.Matcher;

import gov.nih.nlm.nls.lexCheck.CheckCont.*;
import gov.nih.nlm.nls.lvg.Api.*;

import Util.*;
/*****************************************************************************
* This class is to group spVars by (same) metaphone, a specified edit distance,
* and the min. sorted distance from a SpVars table.
*
* This class was used for the initial submission of the AMIA paper.
*
* Input: norm form|SpVar 1|SpVar 2|SpVar 3|...
* Output-1: base form|SpVar 1|SpVar 2|SpVar 3|...
*
* Combine the following:
* <ul>
* <li>same metaphone form (-f:m) 
* <li>edit distance restriction 2: 
* <li>min. sorted distance:
* <p>
* <li>Example: "anemia|anaemia", [ANM], Ed: 1
* <li>Example: "anemic|anaemic", [ANMK], Ed: 1
* <li>Example: "abortigenic|abortogenic", [ABRTJNK], Ed: 1
* <li>Example: "lamictal|lamiktal", [LMKTL], Ed: 1
* <p>
* <li>Example: "yuppie|yuppy", [YP], Ed: 2
* <li>Example: "yuppie flu|yuppy flu", [YPFL], Ed: 2
* <li>Example: "lamellose|lamellous", [LMLS], Ed: 2
* <li>Example: "zoril|zorilla|zorille|zorillo", [SRL], Ed: 2-1
* <p>
* <li>Example: "anemia|anemic" should not include, different metaphone
* </ul>
*
* @author NLM NLS Development Team
*
* @version    V-2014
*****************************************************************************/
public class GroupSpVarByMES
{
	// private constructor: this class only exposes static methods
	private GroupSpVarByMES()
	{
	}

	/**
	* Command line entry point: reads norm|spVars groups from a file, groups 
	* them with {@link #GroupByMES} and prints the result to an output file.
	* With no arguments, the built-in default paths and a max. edit distance 
	* of 2 are used.
	*
	* @param args either empty, or: inFile, outFile, maxEditDist
	*/
	public static void main(String[] args)
	{
		String topDir =
			"/nfsvol/lex/Lu/Development/Lexicon/Components/Multiwords/";
		String year = "2016";
		int maxEditDist = 2;
		String inFile = topDir + "data/" + year
			+ "/outData/08.MatcherSpVar/UnitTest/LRSPL.data.1.byNorm.out";
		String outFile = topDir + "data/" + year
			+ "/outData/08.MatcherSpVar/UnitTest/LRSPL.data.2.byMES.2.out";

		if(args.length == 3)
		{
			inFile = args[0];
			outFile = args[1];
			maxEditDist = Integer.parseInt(args[2]);
		}
		else if(args.length > 0)
		{
			System.out.println("Usage: java GroupSpVarByMES <inFile> <outFile> <maxEditDist>");
			System.exit(0);
		}

		// read in input
		HashMap<String, HashSet<String>> inBaseSpVarsMap 
			= ReadInSpVars.ReadNormSpVarsFromFile(inFile);

		// group by Metaphone, Edit distance, and Sorted distance
		HashMap<String, ArrayList<String>> baseSpVarsMapMES 
			= GroupByMES(inBaseSpVarsMap, maxEditDist);

		// print out
		PrintOutSpVars.PrintOutSpVarGroupsList(baseSpVarsMapMES, outFile);
	}

	// public methods
	/**
	* Merge every single-member spVar group into another group when a 
	* suitable target spVar exists. A target must:
	* <ul>
	* <li>1. have the same metaphone code (Metaphone.GetMetaphone)
	* <li>2. pass EditDistance.IsLegalEditDistanace with maxEditDist
	* <li>3. have the min. sorted distance (SortedDistance.GetSortedDistance) 
	*        among all candidates that passed 1 and 2
	* </ul>
	* Example: yuppie|yuppy (not yup), saree|sari (not SarA),
	* anemia|anaemia (not anemic).
	* <p>
	* The input map is not modified. Progress and summary counts are printed 
	* to System.out.
	*
	* @param inBaseSpVarsMap  key|spVars groups; only the values are used
	* @param maxEditDist  max. edit distance passed to the edit distance check
	*
	* @return  map of base|spVars, where each list is sorted by the base 
	* comparator and the key (base) is the first element of that list
	*/
	public static HashMap<String, ArrayList<String>> GroupByMES(
		HashMap<String, HashSet<String>> inBaseSpVarsMap, int maxEditDist)
	{
		System.out.println("===== GroupByMES =====");
		System.out.println("-- Input inBaseSpVarsMap key size: "
			+ inBaseSpVarsMap.keySet().size());

		// I. pre-process: establish following tables and lists
		// 1. singleSpVarList: spVars that are the only member of their group
		ArrayList<String> singleSpVarList = new ArrayList<>();
		// 2. sortedSpVarList: all spVars, sorted, used for sorted distance
		ArrayList<String> sortedSpVarList = new ArrayList<>();
		// 3. baseSpVarsMap: base|spVars, the working (unsorted) results
		HashMap<String, ArrayList<String>> baseSpVarsMap = new HashMap<>();
		// 4. spVarBaseMap: spVar|base (index to find the group of a spVar)
		HashMap<String, String> spVarBaseMap = new HashMap<>();
		// 5. mpSpVarsMap: metaphone|spVars
		HashMap<String, HashSet<String>> mpSpVarsMap = new HashMap<>();

		for(HashSet<String> inSpVars:inBaseSpVarsMap.values())
		{
			ArrayList<String> values = new ArrayList<>(inSpVars);
			// an empty group has no base and nothing to index, skip it
			if(values.isEmpty() == true)
			{
				continue;
			}

			// 1. singleSpVarList: group with only one spVar
			if(values.size() == 1)
			{
				singleSpVarList.add(values.get(0));
			}

			// 2. all SpVars: sorted after the loop is finished
			sortedSpVarList.addAll(values);

			// 3. base|spVars: base is the first spVar by base comparator
			Collections.sort(values, bc_);
			String base = values.get(0);
			baseSpVarsMap.put(base, values);

			for(String spVar:values)
			{
				// 4. spVar|base
				spVarBaseMap.put(spVar, base);

				// 5. metaphone|spVars: put same metaphone spVars together
				String metaphone 
					= Metaphone.GetMetaphone(spVar, maxCodeLength_);
				mpSpVarsMap.computeIfAbsent(metaphone, 
					k -> new HashSet<String>()).add(spVar);
			}
		}

		// 2-b. sort singleSpVarList and sortedSpVarList
		Collections.sort(singleSpVarList);
		Collections.sort(sortedSpVarList);

		System.out.println("====== Complete Pre-Process ======");
		System.out.println("-- No. of single SpVar: " + singleSpVarList.size());
		System.out.println("-- No. of total sorted SpVars: " 
			+ sortedSpVarList.size());
		System.out.println("-- No. of key for spVar|base table (same no. as above): " 
			+ spVarBaseMap.keySet().size());
		System.out.println("-- No. of key for base|SpVar table: " 
			+ baseSpVarsMap.keySet().size());
		System.out.println("-- No. of key for Metaphone|spVar table: " 
			+ mpSpVarsMap.keySet().size());

		// II. Process:
		// Go through all sorted single spVars and try to merge each of them
		// into the group of its best target spVar
		int combineNo = 0;
		int sLineNo = 0;
		for(String singleSpVar:singleSpVarList)
		{
			sLineNo++;
			// 1. Metaphone: find spVars with same metaphone
			// (never null: every single spVar was indexed in pre-process)
			String singleSpVarMp = Metaphone.GetMetaphone(singleSpVar, 
				maxCodeLength_);
			HashSet<String> sameMpSpVarSet = mpSpVarsMap.get(singleSpVarMp);

			// 2. Edit Distance: keep those within the legal edit distance
			ArrayList<String> withinEdSpVarList = FindWithinEditDistance(
				singleSpVar, sameMpSpVarSet, maxEditDist);

			// 3. Sorted distance: the one with least sorted distance
			String targetSpVar = FindMinSortedDistance(singleSpVar, 
				withinEdSpVarList, sortedSpVarList);

			// 4. combine the group of singleSpVar into the group of targetSpVar
			if(targetSpVar != null)
			{
				String targetKey = spVarBaseMap.get(targetSpVar);
				ArrayList<String> targetValues = baseSpVarsMap.get(targetKey);
				String sourceKey = spVarBaseMap.get(singleSpVar);
				ArrayList<String> sourceValues = baseSpVarsMap.get(sourceKey);

				// skip if both are already in the same group
				if(targetValues.contains(singleSpVar) == false)
				{
					combineNo++;
					// the source group may have grown by earlier merges,
					// so move all of its members, not only singleSpVar
					targetValues.addAll(sourceValues);
					baseSpVarsMap.remove(sourceKey);

					// re-index all moved spVars to the target group
					for(String srcValue:sourceValues)
					{
						spVarBaseMap.replace(srcValue, targetKey);
					}
				}
			}

			// status
			if((sLineNo%100000) == 0)
			{
				System.out.println("-- MES: " + sLineNo);
			}
		}

		// III. final formating: sort each group and re-key it by its base,
		// because merging can add a spVar that sorts before the old key
		int noSpVarGroupNo = 0;
		int spVarGroupNo = 0;
		HashMap<String, ArrayList<String>> baseSpVarsMapMES = new HashMap<>();
		for(ArrayList<String> values:baseSpVarsMap.values())
		{
			if(values.size() == 1)
			{
				noSpVarGroupNo++;
			}
			else
			{
				spVarGroupNo++;
			}

			ArrayList<String> sortedValues = new ArrayList<>(values);
			Collections.sort(sortedValues, bc_);  // sort by base comparator
			baseSpVarsMapMES.put(sortedValues.get(0), sortedValues);
		}

		System.out.println("====== Complete Process - Results ======");
		System.out.println(
			"-- Total no of single spVars combined by MES: "
			+ combineNo);
		System.out.println("-- Total group no of out baseSpVarsMapMES: "
			+ baseSpVarsMapMES.keySet().size());
		System.out.println("-- Total No. of non-single spVars classes: "
			+ spVarGroupNo);
		System.out.println("-- Total No. of single spVars classes: "
			+ noSpVarGroupNo);

		return baseSpVarsMapMES;
	}

	// private methods
	// Collect all spVars from sameMpSpVarSet (excluding singleSpVar itself) 
	// that pass the legal edit distance check against singleSpVar.
	private static ArrayList<String> FindWithinEditDistance(
		String singleSpVar, Set<String> sameMpSpVarSet, int maxEditDist)
	{
		ArrayList<String> withinEdSpVarList = new ArrayList<>();
		for(String sameMpSpVar:sameMpSpVarSet)
		{
			if((singleSpVar.equals(sameMpSpVar) == false) // not itself
			&& (EditDistance.IsLegalEditDistanace(singleSpVar, sameMpSpVar, 
				caseFlag_, maxEditDist) == true))
			{
				withinEdSpVarList.add(sameMpSpVar);
			}
		}
		return withinEdSpVarList;
	}

	// Return the candidate with the min. sorted distance to singleSpVar, or
	// null if there is no usable candidate. The first one wins on ties.
	// A sorted distance of -1 is treated as an error: it is reported and the
	// candidate is skipped.
	private static String FindMinSortedDistance(String singleSpVar,
		List<String> candidates, ArrayList<String> sortedSpVarList)
	{
		String targetSpVar = null;
		int minSd = -1;
		for(String curSpVar:candidates)
		{
			int curSd = SortedDistance.GetSortedDistance(
				singleSpVar, curSpVar, sortedSpVarList);

			if(curSd == -1)	// exclude error, should not happen
			{
				System.err.println("** ERR@GroupSpVarByMES( ): min. sorted distance, no curSd");
			}
			else if((targetSpVar == null) || (curSd < minSd))
			{
				// keep the running min. so later candidates are compared
				// against the best so far, not against the first candidate
				targetSpVar = curSpVar;
				minSd = curSd;
			}
		}
		return targetSpVar;
	}

	// data member
	// comparator used to sort spVars so that the base form comes first
	private static final BaseComparator<String> bc_ 
		= new BaseComparator<String>();
	// case flag passed to the edit distance check
	private static final boolean caseFlag_ = false;
	// max. code length passed to the metaphone generator
	private static final int maxCodeLength_ = 10;
}
