(svn r25653) -Add: Caret movement by words for CJK languages.

2013-08-05 20:35:31 +00:00
parent 4248d90937
commit e6096cb8af
5 changed files with 220 additions and 123 deletions
--- a/src/string.cpp
+++ b/src/string.cpp
@@ -661,50 +661,132 @@ int strnatcmp(const char *s1, const char *s2, bool ignore_garbage_at_front)
 class IcuStringIterator : public StringIterator
 {
 	icu::BreakIterator *char_itr; ///< ICU iterator for characters.
+	icu::BreakIterator *word_itr; ///< ICU iterator for words.
 	const char *string;           ///< Iteration string in UTF-8.

+	SmallVector<UChar, 32> utf16_str;      ///< UTF-16 copy of the string (zero terminated).
+	SmallVector<size_t, 32> utf16_to_utf8; ///< Mapping from UTF-16 code unit index to the index of the corresponding character in the UTF-8 source string.
+
 public:
-	IcuStringIterator() : char_itr(NULL)
+	IcuStringIterator() : char_itr(NULL), word_itr(NULL)
 	{
 		UErrorCode status = U_ZERO_ERROR;
 		this->char_itr = icu::BreakIterator::createCharacterInstance(icu::Locale(_current_language != NULL ? _current_language->isocode : "en"), status);
+		this->word_itr = icu::BreakIterator::createWordInstance(icu::Locale(_current_language != NULL ? _current_language->isocode : "en"), status); /* NOTE(review): status is not inspected after either create call. */
+
+		*this->utf16_str.Append() = '\0';  /* Start out with an empty, zero-terminated UTF-16 string... */
+		*this->utf16_to_utf8.Append() = 0; /* ...whose single entry maps to UTF-8 index 0. */
 	}

 	virtual ~IcuStringIterator()
 	{
 		delete this->char_itr;
+		delete this->word_itr; /* Created in the constructor alongside char_itr. */
 	}

 	virtual void SetString(const char *s)
 	{
 		this->string = s;

+		/* Unfortunately current ICU versions only provide rudimentary support
+		 * for word break iterators (especially for CJK languages) in combination
+		 * with UTF-8 input. As a workaround we convert the input to UTF-16 and
+		 * build a mapping from each UTF-16 code unit back to its UTF-8 index. */
+		this->utf16_str.Clear();
+		this->utf16_to_utf8.Clear();
+
+		while (*s != '\0') {
+			size_t idx = s - this->string; /* Offset of this character in the UTF-8 input. */
+
+			WChar c = Utf8Consume(&s);
+			if (c <	0x10000) {
+				*this->utf16_str.Append() = (UChar)c;
+			} else {
+				/* Make a surrogate pair; both halves map to the same UTF-8 index. */
+				*this->utf16_str.Append() = (UChar)(0xD800 + ((c - 0x10000) >> 10));
+				*this->utf16_str.Append() = (UChar)(0xDC00 + ((c - 0x10000) & 0x3FF));
+				*this->utf16_to_utf8.Append() = idx;
+			}
+			*this->utf16_to_utf8.Append() = idx;
+		}
+		*this->utf16_str.Append() = '\0';
+		*this->utf16_to_utf8.Append() = s - this->string; /* The terminator maps to the end of the UTF-8 input. */
+
 		UText text = UTEXT_INITIALIZER;
 		UErrorCode status = U_ZERO_ERROR;
-		utext_openUTF8(&text, s, -1, &status);
+		utext_openUChars(&text, this->utf16_str.Begin(), this->utf16_str.Length() - 1, &status); /* Length excludes the terminator. */
 		this->char_itr->setText(&text, status);
+		this->word_itr->setText(&text, status);
 		this->char_itr->first();
+		this->word_itr->first();
 	}

 	virtual size_t SetCurPosition(size_t pos)
 	{
+		/* Convert the incoming UTF-8 position to a UTF-16 string index. */
+		uint utf16_pos = 0; /* Remains 0 when no mapping entry equals pos. */
+		for (uint i = 0; i < this->utf16_to_utf8.Length(); i++) {
+			if (this->utf16_to_utf8[i] == pos) {
+				utf16_pos = i;
+				break;
+			}
+		}
+
 		/* isBoundary has the documented side-effect of setting the current
 		 * position to the first valid boundary equal to or greater than
 		 * the passed value. */
-		this->char_itr->isBoundary((int32_t)pos);
-		return this->char_itr->current();
+		this->char_itr->isBoundary(utf16_pos);
+		return this->utf16_to_utf8[this->char_itr->current()]; /* Translate the result back to a UTF-8 index. */
 	}

-	virtual size_t Next()
+	virtual size_t Next(IterType what)
 	{
-		int32_t pos = this->char_itr->next();
-		return pos == icu::BreakIterator::DONE ? END : pos;
+		int32_t pos; /* New position as UTF-16 index, or DONE when there is none. */
+		switch (what) {
+			case ITER_CHARACTER:
+				pos = this->char_itr->next();
+				break;
+
+			case ITER_WORD:
+				pos = this->word_itr->following(this->char_itr->current());
+				/* The ICU word iterator considers both the start and the end of a word a valid
+				 * break point, but we only want word starts. Move to the next location in
+				 * case the new position points to whitespace. */
+				while (pos != icu::BreakIterator::DONE && IsWhitespace(Utf16DecodeChar((const uint16 *)&this->utf16_str[pos]))) pos = this->word_itr->next();
+				/* Sync the character iterator; skip on DONE, as isBoundary would move it to the string start. */
+				if (pos != icu::BreakIterator::DONE) this->char_itr->isBoundary(pos);
+				break;
+
+			default:
+				NOT_REACHED();
+		}
+
+		return pos == icu::BreakIterator::DONE ? END : this->utf16_to_utf8[pos]; /* Report the position as UTF-8 index. */
 	}

-	virtual size_t Prev()
+	virtual size_t Prev(IterType what)
 	{
-		int32_t pos = this->char_itr->previous();
-		return pos == icu::BreakIterator::DONE ? END : pos;
+		int32_t pos; /* New position as UTF-16 index, or DONE when there is none. */
+		switch (what) {
+			case ITER_CHARACTER:
+				pos = this->char_itr->previous();
+				break;
+
+			case ITER_WORD:
+				pos = this->word_itr->preceding(this->char_itr->current());
+				/* The ICU word iterator considers both the start and the end of a word a valid
+				 * break point, but we only want word starts. Move to the previous location while
+				 * the position points to whitespace, but stop at the start of the string. */
+				while (pos != icu::BreakIterator::DONE && pos != 0 && IsWhitespace(Utf16DecodeChar((const uint16 *)&this->utf16_str[pos]))) pos = this->word_itr->previous();
+				/* Sync the character iterator; DONE is not a valid offset to pass to isBoundary. */
+				if (pos != icu::BreakIterator::DONE) this->char_itr->isBoundary(pos);
+				break;
+
+			default:
+				NOT_REACHED();
+		}
+
+		return pos == icu::BreakIterator::DONE ? END : this->utf16_to_utf8[pos]; /* Report the position as UTF-8 index. */
 	}
 };

@@ -742,26 +824,79 @@ public:
 		return this->cur_pos = pos;
 	}

-	virtual size_t Next()
+	virtual size_t Next(IterType what)
 	{
 		assert(this->string != NULL);

 		/* Already at the end? */
 		if (this->cur_pos >= this->len) return END;

-		WChar c;
-		this->cur_pos += Utf8Decode(&c, this->string + this->cur_pos);
-		return this->cur_pos;
+		switch (what) {
+			case ITER_CHARACTER: {
+				WChar c;
+				this->cur_pos += Utf8Decode(&c, this->string + this->cur_pos);
+				return this->cur_pos;
+			}
+
+			case ITER_WORD: {
+				WChar c;
+				/* Consume the current word: advance until whitespace or the end of the string. */
+				size_t offs = Utf8Decode(&c, this->string + this->cur_pos);
+				while (this->cur_pos < this->len && !IsWhitespace(c)) {
+					this->cur_pos += offs;
+					offs = Utf8Decode(&c, this->string + this->cur_pos);
+				}
+				/* Consume the whitespace up to the next word or the end of the string. */
+				while (this->cur_pos < this->len && IsWhitespace(c)) {
+					this->cur_pos += offs;
+					offs = Utf8Decode(&c, this->string + this->cur_pos);
+				}
+
+				return this->cur_pos;
+			}
+
+			default:
+				NOT_REACHED();
+		}
+
+		return END; /* Only reached if a case above falls out of the switch; both handled cases return. */
 	}

-	virtual size_t Prev()
+	virtual size_t Prev(IterType what)
 	{
 		assert(this->string != NULL);

 		/* Already at the beginning? */
 		if (this->cur_pos == 0) return END;

-		return this->cur_pos = Utf8PrevChar(this->string + this->cur_pos) - this->string;
+		switch (what) {
+			case ITER_CHARACTER:
+				return this->cur_pos = Utf8PrevChar(this->string + this->cur_pos) - this->string;
+
+			case ITER_WORD: {
+				const char *s = this->string + this->cur_pos;
+				WChar c;
+				/* Step back over whitespace directly before the current position. */
+				do {
+					s = Utf8PrevChar(s);
+					Utf8Decode(&c, s);
+				} while (s > this->string && IsWhitespace(c));
+				/* Step back over the preceding word until whitespace or the string start. */
+				while (s > this->string && !IsWhitespace(c)) {
+					s = Utf8PrevChar(s);
+					Utf8Decode(&c, s);
+				}
+				/* If we stopped on whitespace, step forward over it to the beginning of the word. */
+				if (IsWhitespace(c)) Utf8Consume(&s);
+
+				return this->cur_pos = s - this->string;
+			}
+
+			default:
+				NOT_REACHED();
+		}
+
+		return END; /* Only reached if a case above falls out of the switch; both handled cases return. */
 	}
 };