/*
 * DParagraph.java
 *
 * Created on April 7, 2005, 2:59 PM
 */

package findstruct;
import java.util.*;

/**Represents an "extended line" or paragraph of text to be de-identified.
 *We pre-process the text to break it into "alphabetic" chunks (words) separated
 *by punctuation.  Thus far, we are not being careful about comprehensiveness
 *of punctuation handling, unicode, etc.  The only punctuation character that
 *is considered to still be part of a word is the apostrophe ('). Other forms
 *of apostrophe, such as the slanted ones, are canonicalized to the straight
 *one.
 *<p>
 *Our internal representation retains all sequences of characters that might be
 *a word; these include anything that is not recognized as blank space or
 *punctuation.  The data may be accessed as a sequence of chunks, where these
 *include all such words and all individual punctuation characters, or a
 *sequence of words, where the punctuation is omitted.  In addition, we can
 *retrieve the indices into the original string of the start and end of each
 *chunk, to permit broader regex matching.
 *<p>
 *Our decision to eliminate spaces from the chunk representation makes it easier
 *to process sequential words, but more difficult to determine, for example, if
 *a line begins with blank space or not.
 *
 * @author Peter Szolovits
 */
public class DParagraph {
    /**
     * The original text from which this DParagraph was constructed.
     */
    String text;
    /**
     * String[] of the non-whitespace chunks that make up the DParagraph
     */
    String[] parts;
    /**
     * int[] indexes into parts to find the i's word; note that not all non-white
     * chunks in parts are words
     */
    int[] wordIndex;// index into parts of i'th word
    /**
     * int[] indexes into parts for the first chunk in each line
     */
    int[] lineIndex;
    /**
     * int[] for each chunk in parts, indexes the initial character in text where that
     * chunk starts
     */
    int[] textIndex;
    
    /**
     * Creates a new instance of DParagraph from a String
     * @param s The String from which the DParagraph is constructed.
     */
    public DParagraph(String s) {
        text = s;
        ArrayList<String> chunks = splitWP();
        // Figure out how many non-whitespace chunks we need to store
        // Each one will also get an index into the text string
        int nParts = 0;
        for (Iterator<String> i = chunks.iterator(); i.hasNext();) {
            if (!isWhite(i.next())) nParts++;
        }
        parts = new String[nParts];
        textIndex = new int[nParts];
        // Figure out how many lines and words we will need,
        // and store the non-whitespace chunks and textIndex
        int wCount=0;
        int lCount=1;   // There is at least one line unless nothing
        int chunkNum=0;
        int textPtr=0;
        boolean lastLineWhite = true;
        //for (Iterator<String> i = chunks.iterator(); i.hasNext();) {
        for (String w: chunks) {
            //String w = i.next();
            if (isWhite(w)) {
                if (isEOL(w)) {
                    lCount++;
                    lastLineWhite = true;
                }
            } else {
                if (!isPunct(w)) wCount++;
                textIndex[chunkNum] = textPtr;
                parts[chunkNum++] = canonicalizeApostrophe(w);
                lastLineWhite = false;
            }
            textPtr += w.length();
        }
        if (lastLineWhite) lCount--;
        wordIndex = new int[wCount];
        int wordNum = 0;
        for (int i = 0; i<parts.length; i++) {
            if (!isPunct(parts[i])) wordIndex[wordNum++]=i;
        }
        lineIndex = new int[lCount];
        int lNum = 0;
        chunkNum = 0;
        boolean nextStartsLine = true;
        for (String w: chunks) {
            if (!isWhite(w)) {
                // It's a word or punctuation; point to it if line start
                if (nextStartsLine) {
                    lineIndex[lNum++] = chunkNum;
                }
                chunkNum++;
                nextStartsLine = false;
            } else if (isEOL(w)) {
                nextStartsLine = true;
            }
        }
    }
    
    /**
     *
     * @return String describing the DParagraph, for debugging
     */
    public String describe() {
        StringBuffer s = new StringBuffer();
        s.append("DParagraph: " + text);
        s.append("\nChunks: ");
        for (String w: parts) {
            s.append(w);
            s.append(",");
        }
        s.append("\nWord index: ");
        for (int i: wordIndex) {
            s.append(i);
            s.append(",");
        }
        s.append("\nLine index: ");
        for (int i: lineIndex) {
            s.append(i);
            s.append(",");
        }
        s.append("\nText index: ");
        for (int i: textIndex) {
            s.append(i);
            s.append(",");
        }
        s.append("\n\n");
        return s.toString();
    }
    
    /** Delims holds the String of characters that are considered to break
     *text into words. These must begin with spaces, followed by line
     *terminators, followed by other punctuation.
     *
     *Delimiters form a natural hierarchy:
     *<ul><li>Delimiters
     *        <ul><li>Blanks
     *                <ul><li>Spaces</li>
     *                    <li>EOL's</li></ul></li>
     *            <li>Others</li></ul></li></ul>
     */
    
    static String delims = " \n\r.,:;-=\"/\\()";
    /**
     * Characters that are considered to be apostrophes, which are retained within
     * words.
     */
    static String apostrophes = "'`";
    static final int whiteN = 3; // first n chars are white-space
    static final int blankN = 1; // first n chars are true blank space
    
    /**
     * Determines if c is a delimiter
     * @param c Character to check
     * @return boolean whether c is a delimiter
     */
    static boolean isDelim(char c) {
        return delims.indexOf(c)>=0;
    }
    
    /**
     * Determines if the String s is a delimiter
     * @param s String to check; it is normally a single-character String.
     * If longer, we use only the first character.
     * @return True if s is a delimiter
     */
    static boolean isDelim(String s) {
        return isDelim(s.charAt(0));
    }
    
    static boolean isPunct(char c) {
        return delims.indexOf(c)>=whiteN;
    }
    
    static boolean isPunct(String s) {
        return isPunct(s.charAt(0));
    }
    
    static boolean isWhite(char c) {
        int place = delims.indexOf(c);
        return -1<place && place<whiteN;
    }
    
    static boolean isWhite(String s) {
        return isWhite(s.charAt(0));
    }
    
    static boolean isEOL(char c) {
        int place = delims.indexOf(c);
        return blankN <= place && place < whiteN;
    }
    
    static boolean isEOL(String s) {
        return isEOL(s.charAt(0));
    }
    
    static boolean isMixed(String s) {
        int ct = Math.abs(numCaps(s));
        return ct > 0 && ct < s.length();
    }
    
    static boolean isCapitalizedMixed(String s) {
        int ct = numCaps(s);
        return ct > 0 && ct < s.length();
    }
    
    static boolean isLower(String s) {
        return numCaps(s)==0;
    }
    
    /**
     * Checks whether a word is all capital letters. Ignores apostrophes
     * and digits.
     */
    static boolean isAllCaps(String s) {
        int ct = numCaps(s);
        int lt = numLetters(s);
        return ct == lt;
    }
    
    /**
     * Counts number of capital letters in String s.
     * Count is negative if first letter is not a capital, but of course
     * zero of none are.
     * @param s String
     * @return int Its absolute value is the number of capital letters in the input. Sign is
     * positive if the first letter is a capital, negative (or zero) otherwise.
     */
    
    static int numCaps(String s) {
        int ans = 0;
        int sgn = -1; // sign positive if first char is cap'd
        for (int i = 0; i<s.length(); i++) {
            char c = s.charAt(i);
            boolean u = Character.isUpperCase(c);
            if (i==0) if (u) sgn=1;
            if (u) ans++;
        }
        return ans*sgn;
    }
    
    static int numApostrophes(String s) {
        int n = 0;
        int i = 0;
        while ((i=s.indexOf(apostrophes.charAt(0), i)) >= 0) {
            n++; i++;
        }
        return n;
    }
    
    static int numLetters(String s) {
        int n = 0;
        for (int i = 0; i < s.length(); i++) {
            if (Character.isLetter(s.charAt(i))) n++;
        }
        return n;
    }
    
    /**
     * Checks whether a String contains an apostrophe
     * @param s The String to check
     * @return True if it contains one of the characters in {@link apostrophes}
     */
    static boolean hasApostrophe(String s) {
        for (int i = 0; i<apostrophes.length(); i++) {
            if (s.indexOf(apostrophes.charAt(i))>=0) return true;
        }
        return false;
    }
    
    /**
     * Converts a String so that all appearances of the non-standard apostrophes
     * (the slanting ones) are replaced by the straight one.
     * @param s String
     * @return String
     */
    static String canonicalizeApostrophe(String s) {
        for (int i = 1; i<apostrophes.length(); i++) {
            s = s.replace(apostrophes.charAt(i), apostrophes.charAt(0));
        }
        return s;
    }
    
    
    
    ArrayList<String> splitWP() {
        StringTokenizer st = new StringTokenizer(text,delims,true);
        ArrayList<String> a = new ArrayList<String>();
        while (st.hasMoreTokens()) {
            a.add(st.nextToken());
        }
        return a;
    }
    
    /**
     * Counts the number of "chunks" making up this DParagraph
     * @return int number of chunks (words and punctuation characters)
     */
    public int countChunks() {
        return parts.length;
    }
    
    /**
     * Counts the number of words making up this DParagraph.  Unlike <CODE>countChunks</CODE>,
     * we do not count the punctuation.
     * @return int number of words in the DParagraph
     */
    public int countWords() {
        return wordIndex.length;
    }
    
    /**
     * Counts the number of distinct input lines that were incorporated into this
     * DParagraph.
     * @return int number of lines
     */
    public int countLines() {
        return lineIndex.length;
    }
    
    /**
     * Retrieves the ith non-whitespace chunk of the DParagraph
     * @param i The index of the chunk to retrieve
     * @return String holding the chunk
     */
    public String getChunk(int i) {
        if (i<0 || i>=parts.length) return null;
        return parts[i];
    }
    
    /**
     * Retrieves the ith word of the DParagraph; non-punctuation chunks are words.
     * @param i The index of the word to retrieve
     * @return String of the word
     */
    public String getWord(int i) {
        if (i<0 || i>=wordIndex.length) return null;
        return parts[wordIndex[i]];
    }
    
    /**
     * Computes a canonical form of a word, as the basis for constructing
     * the WordTrie elements of the Dict.  The canonical form lower-cases
     * the entire word and converts all forms of apostrophe (single-quote)
     * into the basic '.
     * @param i int selects the word
     * @return null if i is out of range, or the canonical form of the word selected
     */
    public String getCanonicalWord(int i) {
        String w = getWord(i);
        return (w==null)? null : canonicalizeApostrophe(w.toLowerCase());
    }
    
    /**
     * Converts an index to the k-th word to an index to its corresponding chunk.
     * @param i int word number
     * @return index of the corresponding chunk that holds that word
     */
    public int getChunkIndexFromWord(int i) {
        if (i<0) return 0;
        if (i>=wordIndex.length) return parts.length;
        return wordIndex[i];
    }
    
    public String phraseSpanningWords(int begword, int endword) {
        int begchunk = getChunkIndexFromWord(begword);
        if (endword>=wordIndex.length) {
            return text.substring(textIndex[begchunk]);
        } else {
            int endchunk = getChunkIndexFromWord(endword);
            return text.substring(textIndex[begchunk], textIndex[endchunk]);
        }
    }
    
    String restWP() {
        StringBuffer rest = new StringBuffer();
        String sep = "";
        boolean seenFirst = false;
        for (int i = 0; i<parts.length; i++) {
            if (!isWhite(parts[i]) && !isPunct(parts[i])) {
                if (seenFirst) {
                    rest.append(sep);
                    rest.append(parts[i]);
                    sep = " ";
                }
                seenFirst = true;
            }
        }
        return rest.toString();
    }
    
    
    public String toString() {
        return "<DParagraph: " + text + ">";
    }
    
    static final int IDENTICAL = 5;
    static final int SAME = 4;
    static final int CASE = 3;
    static final int APPROX = 2;
    static final int CONTEXT = 1;
    static final int NO_MATCH = 0;
    
    /**
     * Compares <CODE>this</CODE> DParagraph to an <CODE>other</CODE> DParagraph.
     * The comparison is between the entire <CODE>this</CODE> and the part of
     * <CODE>other</CODE> spanning from <CODE>startWord</CODE>
     * until <CODE>endWord</CODE>, not including <CODE>endWord</CODE>.
     * Thus, the number of successive
     * words compared is <CODE>endWord-startWord</CODE>.  The results are
     * as follows:
     * <ol><li value=5>IDENTICAL, including punctuation</li>
     * <li value=4>SAME, all words except punctuation</li>
     * <li value=3>CASE, same words, but in different case(s)</li>
     * <li value=0>NO_MATCH, failed to match</li>
     * </ol>
     *Note that this match is asymmetric, because the number of words to
     *match must be given by the number of words in <CODE>this</CODE>.
     *
     * @param other DParagraph against which to match this
     * @param startWord index of starting word in other
     * @param endWord index of ending word in other
     * @return int as specified above; 0 is no match.
     */
    
    public int matches(DParagraph other, int startWord, int endWord) {
        int nWords = countWords();
        int otherNWords = other.countWords();
        if (nWords!=(endWord-startWord)
        || startWord<0 || endWord>other.countWords()) return NO_MATCH;
        int startChunk = other.wordIndex[startWord];
        int endChunk = (endWord==otherNWords) ? other.parts.length 
                : other.wordIndex[endWord];
        boolean perfectMatch = true;
        if (parts.length!=(endChunk-startChunk)) {
            perfectMatch = false;
        } else {
            for (int i = 0; i<parts.length; i++) {
                if (!parts[i].equals(other.parts[i+startChunk])) {
                    perfectMatch = false;
                    break;
                }
            }
        }
        if (perfectMatch) return IDENTICAL;
        int tentative = SAME;
        for (int i = 0; i<nWords; i++) {
            if (!getWord(i).equalsIgnoreCase(other.getWord(i+startWord))) {
                // No match, even with case difference!
                return NO_MATCH;
            }
            if (!getWord(i).equals(other.getWord(i+startWord))) {
                // Matched (above) ignoring case, but not strictly.
                tentative = CASE;
            }
        }
        // If we got here, all words matched, perhaps with case differences.
        return tentative;
    }
    
    public int matches(DParagraph other, int startWord) {
        return matches(other, startWord, startWord+countWords());
    }
    
    public int matches(DParagraph other) {
        return matches(other, 0);
    }
    
    /**
     * Tests for equality of two Paragraphs.  Under this form of strict equality, they
     * must contain the same succesion of words and punctuation, and may differ only
     * in white space.
     * @param other DParagraph against which to compare.
     * @return boolean value telling whether the comparison succeeded or not.
     */
    public boolean equals(DParagraph other) {
        if (parts.length != other.parts.length) return false;
        return equals(other,0, parts.length);
    }
    
    /**
     * Tests for equality of this DParagraph with the portion of <CODE>other</CODE>
     * starting at chunk start.  Note that we compare the <I>entire</I> content of <CODE>this</CODE>
     * DParagraph with the selected part of <CODE>other</CODE>.
     * @param other
     * @param start
     * @return
     */
    public boolean equals(DParagraph other, int start) {
        if ((start + parts.length) >= other.parts.length) return false;
        return equals(other, start, start+parts.length);
    }
    
    /**
     * Tests for equality of this DParagraph with the portion of <code>other</code>
     * consisting of chunks <CODE>start</CODE> until <CODE>end</CODE>. Note
     * that the comparison is over this <i>entire<i> DParagraph, but only part
     * of <code>other</code>.
     * @param other
     * @param start
     * @param end
     * @return boolean true iff the sequence of chunks in the DParagraph <CODE>other</CODE> starting
     * at <CODE>start</CODE> and up to (but not including) <CODE>end</CODE> match
     * the complete sequence of chunks in this DParagraph.
     */
    public boolean equals(DParagraph other, int start, int end) {
        if (parts.length != (end-start)) return false;
        for (int i=0; i<parts.length; i++) {
            if (!parts[i].equals(other.parts[start+i])) return false;
        }
        return true;
    }
    
    /**
     * Tests for equality of two Paragraphs.  Similar to <CODE>equals</CODE>
     * but ignores case differences.
     * @param other DParagraph against which to compare.
     * @return boolean value telling whether the comparison succeeded or not.
     */
    public boolean equalsIgnoreCase(DParagraph other) {
        if (parts.length != other.parts.length) return false;
        return equalsIgnoreCase(other, 0, parts.length);
    }
    
    /**
     * Tests for equality of this DParagraph with the portion of <CODE>other</CODE>
     * starting at chunk start.  Note that we compare the <I>entire</I> content of <CODE>this</CODE>
     * DParagraph with the selected part of <CODE>other</CODE>. Similar to <CODE>equals</CODE>
     * but ignores case differences.
     * @param other
     * @param start
     * @return
     */
    public boolean equalsIgnoreCase(DParagraph other, int start) {
        if ((start + parts.length) >= other.parts.length) return false;
        return equalsIgnoreCase(other, start, start+parts.length);
    }
    
    /**
     * Tests for equality of this DParagraph with the portion of <code>other</code>
     * consisting of chunks <CODE>start</CODE> until <CODE>end</CODE>. Note
     * that the comparison is over this <i>entire<i> DParagraph, but only part
     * of <code>other</code>. Similar to <CODE>equals</CODE>
     * but ignores case differences.
     * @param other
     * @param start
     * @param end
     * @return boolean true iff the sequence of chunks in the DParagraph <CODE>other</CODE> starting
     * at <CODE>start</CODE> and up to (but not including) <CODE>end</CODE> match
     * the complete sequence of chunks in this DParagraph.
     */
    public boolean equalsIgnoreCase(DParagraph other, int start, int end) {
        if (parts.length != (end-start)) return false;
        for (int i=0; i<parts.length; i++) {
            if (!parts[i].equalsIgnoreCase(other.parts[start+i])) return false;
        }
        return true;
    }
    
    /**
     * Compares the sequence of words in this DParagraph with those in <CODE>other</CODE>.
     * @param other
     * @return
     */
    public boolean equalsIgnorePunctuation(DParagraph other) {
        if (wordIndex.length != other.wordIndex.length) return false;
        return equalsIgnorePunctuation(other, 0, other.wordIndex.length);
    }
    
    /**
     * Comparest the sequence of words in this DParagraph with those in
     * <CODE>other</CODE>, starting at the <CODE>start</CODE>'th word of other.
     * This permits words to be left over in <CODE>other</CODE>
     * after a successful match.
     * @param other
     * @param start
     * @return
     */
    public boolean equalsIgnorePunctuation(DParagraph other, int start) {
        if ((start+wordIndex.length) >= other.wordIndex.length) return false;
        return equalsIgnorePunctuation(other, start, start+wordIndex.length);
    }
    
    /**
     * Compares this DParagraph to <CODE>other</CODE>, ignoring punctuation.  Because we
     * ignore punctuation, we are only interested in comparing words; thus,
     * unlike in the cases of <code>equals</code> or <code>equalsIgnoreCase</code>,
     * the indices <CODE>start</CODE> and <CODE>end</CODE> refer to the word number,
     * not chunk number.
     * @param other
     * @return
     */
    public boolean equalsIgnorePunctuation(DParagraph other, int start, int end) {
        if (wordIndex.length != (end - start)) return false;
        for (int i=0; i<wordIndex.length; i++) {
            if (!getWord(i).equals(other.getWord(start+i))) return false;
        }
        return true;
    }
    
    /**
     * Compares the sequence of words in this DParagraph with those in <CODE>other</CODE>.
     * @param other
     * @return
     */
    public boolean equalsIgnoreCaseIgnorePunctuation(DParagraph other) {
        if (wordIndex.length != other.wordIndex.length) return false;
        return equalsIgnoreCaseIgnorePunctuation(other, 0, other.wordIndex.length);
    }
    
    /**
     * Comparest the sequence of words in this DParagraph with those in
     * <CODE>other</CODE>, starting at the <CODE>start</CODE>'th word of other.
     * This permits words to be left over in <CODE>other</CODE>
     * after a successful match.
     * @param other
     * @param start
     * @return
     */
    public boolean equalsIgnoreCaseIgnorePunctuation(DParagraph other, int start) {
        if ((start+wordIndex.length) >= other.wordIndex.length) return false;
        return equalsIgnoreCaseIgnorePunctuation(other, start, start+wordIndex.length);
    }
    
    /**
     * Compares this DParagraph to <CODE>other</CODE>, ignoring punctuation.  Because we
     * ignore punctuation, we are only interested in comparing words; thus,
     * unlike in the cases of <code>equals</code> or <code>equalsIgnoreCase</code>,
     * the indices <CODE>start</CODE> and <CODE>end</CODE> refer to the word number,
     * not chunk number.
     * @param other
     * @return
     */
    public boolean equalsIgnoreCaseIgnorePunctuation(DParagraph other, int start, int end) {
        if (wordIndex.length != (end - start)) return false;
        for (int i=0; i<wordIndex.length; i++) {
            if (!getWord(i).equalsIgnoreCase(other.getWord(start+i))) return false;
        }
        return true;
    }
}
