package de.xam.texthtml.text;

import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;

import org.xydra.index.IIntegerRangeIndex;
import org.xydra.index.impl.IntegerRangeIndex;

/**
 * Definitions of 'whitespace', 'text' and 'mixed' (can occur in text and in white-space). For completeness, a fourth
 * category is 'other' (reserved, unassigned, surrogate).
 *
 * @author xamde
 */
public class Unicodes {

	/**
	 * Coarse kind of a single code point as returned by {@link Unicodes#getKindOfCharacter(int)}: a digit, a
	 * lowercase character, an uppercase character, or {@link #None} for everything else.
	 */
	public enum KindOfCharacter {
		Digit, Lowercase, None, Uppercase
	}

	/**
	 * Code points indexed with the bounds 0x80 and 0x9F (the C1 control range); consulted by
	 * {@link #looksLikeBrokenUTF8(String)}.
	 */
	private static final IIntegerRangeIndex BROKEN_UTF8;

	/**
	 * Unicode characters that could both be part of a name or part of spacing. Note they include neither
	 * {@link #unicodePureText} nor {@link #unicodePureSeparator}.
	 *
	 * Filled by the static initializer with the code points whose {@link Character#getType(int)} is one of:
	 * MATH_SYMBOL, CONNECTOR_PUNCTUATION, DASH_PUNCTUATION, ENCLOSING_MARK, END_PUNCTUATION,
	 * FINAL_QUOTE_PUNCTUATION, FORMAT, INITIAL_QUOTE_PUNCTUATION, NON_SPACING_MARK, OTHER_PUNCTUATION, OTHER_SYMBOL,
	 * PRIVATE_USE, START_PUNCTUATION.
	 *
	 * Example '-' or '_'.
	 */
	public static final IntegerRangeIndex unicodeMixed = new IntegerRangeIndex();

	/**
	 * Code points whose {@link Character#getType(int)} is {@link Character#SURROGATE} or
	 * {@link Character#UNASSIGNED}.
	 */
	public static final IntegerRangeIndex unicodeOther = new IntegerRangeIndex();

	/**
	 * Characters that should be removed in 'trim' operations: includes whitespace, line-breaks and control
	 * characters. Filled with the code points of type CONTROL, LINE_SEPARATOR, PARAGRAPH_SEPARATOR and
	 * SPACE_SEPARATOR.
	 */
	public static final IntegerRangeIndex unicodePureSeparator = new IntegerRangeIndex();

	/**
	 * Union of {@link #unicodePureSeparator} and {@link #unicodeMixed}.
	 */
	public static final IntegerRangeIndex unicodeSeparator_or_mixed = new IntegerRangeIndex();

	/**
	 * Union of {@link #unicodePureText} and {@link #unicodeMixed}.
	 */
	public static final IntegerRangeIndex unicodeText_or_mixed = new IntegerRangeIndex();

	/**
	 * Everything that is not: separator, control, unassigned, punctuation. Filled with the code points of type
	 * COMBINING_SPACING_MARK, CURRENCY_SYMBOL, DECIMAL_DIGIT_NUMBER, LETTER_NUMBER, LOWERCASE_LETTER,
	 * MODIFIER_LETTER, MODIFIER_SYMBOL, OTHER_LETTER, OTHER_NUMBER, TITLECASE_LETTER and UPPERCASE_LETTER.
	 */
	public static final IntegerRangeIndex unicodePureText = new IntegerRangeIndex();

	/**
	 * What cannot occur in a usual name, e.g. line breaks. Contains the code points of the general categories
	 * CONTROL, LINE_SEPARATOR and PARAGRAPH_SEPARATOR.
	 */
	public static final IntegerRangeIndex unicodeNoName = new IntegerRangeIndex();

	/** The UTF-8 charset, looked up once by name. */
	public static final Charset UTF8 = Charset.forName("UTF-8");

	/** Code points whose {@link Character#getType(int)} is {@link Character#CONTROL}. */
	public static final IntegerRangeIndex unicodeControl = new IntegerRangeIndex();

	/**
	 * Legal and not discouraged characters valid in XML 1.0.
	 *
	 * NOTE(review): the static initializer indexes 0x0020..0xD7FF as a whole, which appears to include the
	 * discouraged controls U+007F..U+009F - confirm whether they were meant to be excluded.
	 */
	public static final IntegerRangeIndex legalEncouragedXml10 = new IntegerRangeIndex();

	/** Code points for which {@link Character#isUpperCase(int)} is true. */
	public static final IntegerRangeIndex unicodeUpperCase = new IntegerRangeIndex();

	/** Code points for which {@link Character#isLowerCase(int)} is true. */
	public static final IntegerRangeIndex unicodeLowerCase = new IntegerRangeIndex();

	/* Fill legalEncouragedXml10 with the code point ranges of the XML 1.0 'Char' production. */
	static {
		/** these are the only C0 controls accepted in XML 1.0 (TAB, LF, CR); */
		legalEncouragedXml10.index(0x0009);
		legalEncouragedXml10.index(0x000A);
		legalEncouragedXml10.index(0x000D);
		/**
		 * this excludes some (not all) non-characters in the BMP (all surrogates, U+FFFE and U+FFFF are forbidden);
		 */
		legalEncouragedXml10.index(0x0020, 0xD7FF);
		legalEncouragedXml10.index(0xE000, 0xFFFD);
		/**
		 * this includes all code points in supplementary planes, including non-characters.
		 */
		legalEncouragedXml10.index(0x10000, 0x10FFFF);

		/**
		 * The preceding code points ranges contain the following controls which are only valid in certain contexts in
		 * XML 1.0 documents, and whose usage is restricted and highly discouraged
		 */
		/** this includes a C0 control character and all but one C1 control. */
		// NOTE(review): the two ranges below are numerically inside 0x0020..0xD7FF indexed above, so they look
		// like they are already contained in the index; removing them would need a de-index operation - confirm
		// the intent against the field name 'legalEncouraged'.
		// legalEncouragedXml10.index(0x007F, 0x0084);
		// legalEncouragedXml10.index(0x0086, 0x009F);

	}

	/*
	 * Initialize the Unicode classification ranges. Every code point from U+0000 up to and including U+10FFFF is
	 * sorted by its general category (Character.getType) into exactly one of: pure text, pure separator, mixed or
	 * other. Upper/lower case, control and 'no name' indexes are filled on the way; the '_or_mixed' unions are
	 * derived at the end.
	 */
	static {
		// '<=' is required: MAX_CODE_POINT (U+10FFFF) is itself a code point and must be classified, too.
		// With '<' it ended up in none of the indexes and toBroadUnicodeCategory failed for it.
		for (int codePoint = Character.MIN_CODE_POINT; codePoint <= Character.MAX_CODE_POINT; codePoint++) {

			if (Character.isUpperCase(codePoint)) {
				unicodeUpperCase.index(codePoint);
			}
			if (Character.isLowerCase(codePoint)) {
				unicodeLowerCase.index(codePoint);
			}

			// IMPROVE take a deep look here http://unicode.org/reports/tr14/

			// decide TEXT, SPACE or MIXED.

			final int type = Character.getType(codePoint);
			switch (type) {
			case Character.COMBINING_SPACING_MARK:
			case Character.CURRENCY_SYMBOL:
			case Character.DECIMAL_DIGIT_NUMBER:
			case Character.LETTER_NUMBER:
			case Character.LOWERCASE_LETTER:
			case Character.MODIFIER_LETTER:
			case Character.MODIFIER_SYMBOL:
			case Character.OTHER_LETTER:
			case Character.OTHER_NUMBER:
			case Character.TITLECASE_LETTER:
			case Character.UPPERCASE_LETTER:
				// treat as TEXT
				unicodePureText.index(codePoint);
				break;

			case Character.CONTROL:
				// controls are additionally 'no name' characters and separators (cascading fall-through)
				unicodeControl.index(codePoint);
				//$FALL-THROUGH$
			case Character.LINE_SEPARATOR:
			case Character.PARAGRAPH_SEPARATOR:
				unicodeNoName.index(codePoint);
				//$FALL-THROUGH$
			case Character.SPACE_SEPARATOR:
				// treat as SEPARATOR
				unicodePureSeparator.index(codePoint);
				break;

			case Character.MATH_SYMBOL:
			case Character.CONNECTOR_PUNCTUATION:
			case Character.DASH_PUNCTUATION:
			case Character.ENCLOSING_MARK:
			case Character.END_PUNCTUATION:
			case Character.FINAL_QUOTE_PUNCTUATION:
			case Character.FORMAT:
			case Character.INITIAL_QUOTE_PUNCTUATION:
			case Character.NON_SPACING_MARK:
			case Character.OTHER_PUNCTUATION:
			case Character.OTHER_SYMBOL:
			case Character.PRIVATE_USE:
			case Character.START_PUNCTUATION:
				// treat as MIXED
				unicodeMixed.index(codePoint);
				break;

			case Character.SURROGATE:
			case Character.UNASSIGNED:
				// treat as OTHER
				unicodeOther.index(codePoint);
				break;
			}

		}

		// === derived
		unicodeSeparator_or_mixed.addAll(unicodePureSeparator);
		unicodeSeparator_or_mixed.addAll(unicodeMixed);

		unicodeText_or_mixed.addAll(unicodePureText);
		unicodeText_or_mixed.addAll(unicodeMixed);
	}

	static {
		BROKEN_UTF8 = new IntegerRangeIndex();
		BROKEN_UTF8.index(0x80, 0x9F);
	}

	/**
	 * @param codePoint the Unicode code point to look up
	 * @return true if the code point is contained in {@link #unicodeMixed}
	 */
	public static boolean isMixed(final int codePoint) {
		return unicodeMixed.isInInterval(codePoint);
	}

	/**
	 * @param codePoint the Unicode code point to look up
	 * @return true if the code point is contained in {@link #unicodeOther}
	 */
	public static boolean isOther(final int codePoint) {
		return unicodeOther.isInInterval(codePoint);
	}

	/**
	 * @param codePoint the Unicode code point to look up
	 * @return true if the code point is contained in {@link #unicodePureSeparator}; members of
	 *         {@link #unicodeMixed} are not considered here
	 */
	public static boolean isSeparator(final int codePoint) {
		return unicodePureSeparator.isInInterval(codePoint);
	}

	/**
	 * @param codePoint the Unicode code point to look up
	 * @return true if the code point is contained in {@link #unicodePureText}; members of {@link #unicodeMixed}
	 *         are not considered here
	 */
	public static boolean isText(final int codePoint) {
		return unicodePureText.isInInterval(codePoint);
	}

	/**
	 * Heuristic check of a string against the {@code BROKEN_UTF8} index (built from the bounds 0x80 and 0x9F).
	 *
	 * @param s the string to inspect (NOTE(review): null handling depends on
	 *          IntegerRangeIndex.isAnyCharacterInIntervals - confirm)
	 * @return the result of {@code IntegerRangeIndex.isAnyCharacterInIntervals(BROKEN_UTF8, s)}; presumably true
	 *         if at least one character of {@code s} lies in the indexed range
	 */
	public static boolean looksLikeBrokenUTF8(final String s) {
		return IntegerRangeIndex.isAnyCharacterInIntervals(BROKEN_UTF8, s);
	}

	/**
	 * Map a code point to a one-character tag for its broad class. The classes are probed in the order text,
	 * separator, mixed, other; the first match wins.
	 *
	 * @param codePoint the Unicode code point to classify
	 * @return 'T' for pure text, '_' for pure separator, 'm' for mixed, '?' for other (surrogate or unassigned)
	 * @throws AssertionError if the code point is in none of the four indexes, e.g. a value the static
	 *             initializer did not classify
	 */
	public static char toBroadUnicodeCategory(final int codePoint) {
		final char category;
		if (isText(codePoint)) {
			category = 'T';
		} else if (isSeparator(codePoint)) {
			category = '_';
		} else if (isMixed(codePoint)) {
			category = 'm';
		} else if (isOther(codePoint)) {
			category = '?';
		} else {
			throw new AssertionError("unicode interval setup is not complete");
		}
		return category;
	}

	/**
	 * Determine the coarse kind of a code point using the JDK predicates. Lowercase is tested first, then
	 * uppercase, then digit.
	 *
	 * @param codepoint the Unicode code point to inspect
	 * @return {@link KindOfCharacter#Lowercase}, {@link KindOfCharacter#Uppercase} or
	 *         {@link KindOfCharacter#Digit} for the first predicate that matches, otherwise
	 *         {@link KindOfCharacter#None}
	 */
	public static KindOfCharacter getKindOfCharacter(final int codepoint) {
		final KindOfCharacter kind;
		if (Character.isLowerCase(codepoint)) {
			kind = KindOfCharacter.Lowercase;
		} else if (Character.isUpperCase(codepoint)) {
			kind = KindOfCharacter.Uppercase;
		} else if (Character.isDigit(codepoint)) {
			kind = KindOfCharacter.Digit;
		} else {
			kind = KindOfCharacter.None;
		}
		return kind;
	}

	/**
	 * Two-letter Unicode general category abbreviations, indexed by the value of {@link Character#getType(int)}
	 * (e.g. index {@link Character#UPPERCASE_LETTER} holds "Lu"). Index 17 is not used by any {@link Character}
	 * category constant and holds a placeholder.
	 */
	public static final String[] CATEGORY_CODE = { "Cn", "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", "Mc", "Nd", "Nl",
			"No", "Zs", "Zl", "Zp", "Cc", "Cf", "??NoCode", "Co", "Cs", "Pd", "Ps", "Pe", "Pc", "Po", "Sm", "Sc", "Sk",
			"So", "Pi", "Pf" };

	/**
	 * Names of the {@link Character} general category constants, indexed by the value of
	 * {@link Character#getType(int)} (e.g. index {@link Character#UPPERCASE_LETTER} holds "UPPERCASE_LETTER").
	 * Index 17 is not used by any {@link Character} category constant and holds a placeholder.
	 */
	public static final String[] CATEGORY_NAMES = { "UNASSIGNED", "UPPERCASE_LETTER", "LOWERCASE_LETTER",
			"TITLECASE_LETTER", "MODIFIER_LETTER", "OTHER_LETTER", "NON_SPACING_MARK", "ENCLOSING_MARK",
			"COMBINING_SPACING_MARK", "DECIMAL_DIGIT_NUMBER", "LETTER_NUMBER", "OTHER_NUMBER", "SPACE_SEPARATOR",
			"LINE_SEPARATOR", "PARAGRAPH_SEPARATOR", "CONTROL", "FORMAT", "??NoCategory", "PRIVATE_USE", "SURROGATE",
			"DASH_PUNCTUATION", "START_PUNCTUATION", "END_PUNCTUATION", "CONNECTOR_PUNCTUATION", "OTHER_PUNCTUATION",
			"MATH_SYMBOL", "CURRENCY_SYMBOL", "MODIFIER_SYMBOL", "OTHER_SYMBOL", "INITIAL_QUOTE_PUNCTUATION",
			"FINAL_QUOTE_PUNCTUATION"

	};

	/**
	 * Print a one-line analysis of a code point to {@link System#out}: numeric value, Unicode name, general
	 * category (number, two-letter code and constant name), directionality and the character itself.
	 *
	 * @param codepoint a valid Unicode code point
	 * @throws IllegalArgumentException if {@code codepoint} is not a valid Unicode code point
	 */
	public static void dumpAnalysisCodepoint(final int codepoint) {
		final int type = Character.getType(codepoint);
		System.out.println("Codepoint=" + codepoint

				+ " Name='" + Character.getName(codepoint) + "'"

				+ " Type " + type + "=" + CATEGORY_CODE[type] + " " + CATEGORY_NAMES[type]

				+ " Dir=" + Character.getDirectionality(codepoint)

				// Character.toChars yields a surrogate pair for code points above U+FFFF; a plain (char) cast
				// would silently truncate them to an unrelated BMP character.
				+ " Char='" + new String(Character.toChars(codepoint)) + "'");

	}

	/**
	 * Debug entry point: dumps an analysis line for every code point in {@link #unicodeControl}. Each entry of
	 * the ranges iterator is walked from its key to its value, both inclusive.
	 *
	 * @param args ignored
	 */
	public static void main(final String[] args) {
		for (final Iterator<Entry<Integer, Integer>> ranges = Unicodes.unicodeControl.rangesIterator(); ranges
				.hasNext();) {
			final Entry<Integer, Integer> range = ranges.next();
			final int first = range.getKey();
			final int last = range.getValue();
			int codePoint = first;
			while (codePoint <= last) {
				dumpAnalysisCodepoint(codePoint);
				codePoint++;
			}
		}
	}

	/**
	 * Test whether a UTF-16 code unit lies in the surrogate range U+D800..U+DFFF.
	 *
	 * @param c the UTF-16 code unit to test
	 * @return true if this is a low or high surrogate
	 */
	public static boolean isSurrogate(final char c) {
		return c >= Character.MIN_SURROGATE && c <= Character.MAX_SURROGATE;
	}

	/**
	 * Test whether a UTF-16 code unit lies in the high (leading) surrogate range U+D800..U+DBFF.
	 *
	 * @param c the UTF-16 code unit to test
	 * @return true if this is a leading/high surrogate, i.e. the first half of a surrogate pair
	 */
	public static boolean isHighSurrogate(final char c) {
		return c >= Character.MIN_HIGH_SURROGATE && c <= Character.MAX_HIGH_SURROGATE;
	}

	/**
	 * Test whether a UTF-16 code unit lies in the low (trailing) surrogate range U+DC00..U+DFFF.
	 *
	 * @param c the UTF-16 code unit to test
	 * @return true if this is a trailing/low surrogate, i.e. the second half of a surrogate pair
	 */
	public static boolean isLowSurrogate(final char c) {
		return Character.MIN_LOW_SURROGATE <= c && c <= Character.MAX_LOW_SURROGATE;
	}

	/**
	 * <pre>
	 * Source: http://unicode.org/reports/tr14/ Table 1. Line Breaking Classes
	 *
	 * Class | Descriptive Name | Examples | Behavior
	 *
	 * Non-tailorable Line Breaking Classes
	 *
	 * BK	Mandatory Break	NL, PS	Cause a line break (after)
	 * CR	Carriage Return	CR	Cause a line break (after), except between CR and LF
	 * LF	Line Feed	LF	Cause a line break (after)
	 * CM	Combining Mark	Combining marks, control codes	Prohibit a line break between the character and the preceding character
	 * NL	Next Line	NEL	Cause a line break (after)
	 * SG	Surrogate	Surrogates	Do not occur in well-formed text
	 * WJ	Word Joiner	WJ	Prohibit line breaks before and after
	 * ZW	Zero Width Space	ZWSP	Provide a break opportunity
	 * GL	Non-breaking (“Glue”)	CGJ, NBSP, ZWNBSP	Prohibit line breaks before and after
	 * SP	Space	SPACE	Enable indirect line breaks
	 * Break Opportunities
	 *
	 * B2	Break Opportunity Before and After	Em dash	Provide a line break opportunity before and after the character
	 * BA	Break After	Spaces, hyphens	Generally provide a line break opportunity after the character
	 * BB	Break Before	Punctuation used in dictionaries	Generally provide a line break opportunity before the character
	 * HY	Hyphen	HYPHEN-MINUS	Provide a line break opportunity after the character, except in numeric context
	 * CB	Contingent Break Opportunity	Inline objects	Provide a line break opportunity contingent on additional information
	 * Characters Prohibiting Certain Breaks
	 *
	 * CL	Close Punctuation	“}”, “❳”, “⟫” etc.	Prohibit line breaks before
	 * CP	Close Parenthesis	“)”, “]”	Prohibit line breaks before
	 * EX	Exclamation/Interrogation	“!”, “?”, etc.	Prohibit line breaks before
	 * IN	Inseparable	Leaders	Allow only indirect line breaks between pairs
	 * NS	Nonstarter	“‼”, “‽”, “⁇”, “⁉”, etc.	Allow only indirect line breaks before
	 * OP	Open Punctuation	“(“, “[“, “{“, etc.	Prohibit line breaks after
	 * QU	Quotation	Quotation marks	Act like they are both opening and closing
	 * Numeric Context
	 *
	 * IS	Infix Numeric Separator	. ,	Prevent breaks after any and before numeric
	 * NU	Numeric	Digits	Form numeric expressions for line breaking purposes
	 * PO	Postfix Numeric	%, ¢	Do not break following a numeric expression
	 * PR	Prefix Numeric	$, £, ¥, etc.	Do not break in front of a numeric expression
	 * SY	Symbols Allowing Break After	/	Prevent a break before, and allow a break after
	 * Other Characters
	 *
	 * AI	Ambiguous (Alphabetic or Ideographic)	Characters with Ambiguous East Asian Width	Act like AL when the resolved EAW is N; otherwise, act as ID
	 * AL	Alphabetic	Alphabets and regular symbols	Are alphabetic characters or symbols that are used with alphabetic characters
	 * CJ	Conditional Japanese Starter	Small kana	Treat as NS or ID for strict or normal breaking.
	 * H2	Hangul LV Syllable	Hangul	Form Korean syllable blocks
	 * H3	Hangul LVT Syllable	Hangul	Form Korean syllable blocks
	 * HL	Hebrew Letter	Hebrew	Do not break around a following hyphen; otherwise act as Alphabetic
	 * ID	Ideographic	Ideographs	Break before or after, except in some numeric context
	 * JL	Hangul L Jamo	Conjoining jamo	Form Korean syllable blocks
	 * JV	Hangul V Jamo	Conjoining jamo	Form Korean syllable blocks
	 * JT	Hangul T Jamo	Conjoining jamo	Form Korean syllable blocks
	 * RI	Regional Indicator	REGIONAL INDICATOR SYMBOL LETTER A .. Z	Keep together, break before and after from others
	 * SA	Complex Context Dependent (South East Asian)	South East Asian: Thai, Lao, Khmer	Provide a line break opportunity contingent on additional, language-specific context analysis
	 * XX	Unknown	Most unassigned, private-use	Have as yet unknown line breaking behavior or unassigned code positions *
	 * </pre>
	 */

}
