summaryrefslogtreecommitdiff
path: root/js/unicode.js
diff options
context:
space:
mode:
Diffstat (limited to 'js/unicode.js')
-rw-r--r--js/unicode.js102
1 files changed, 102 insertions, 0 deletions
diff --git a/js/unicode.js b/js/unicode.js
index 700434d..2199750 100644
--- a/js/unicode.js
+++ b/js/unicode.js
@@ -428,6 +428,107 @@ var unicode = (function(){
return str;
}
// Decode text written as escaped UTF-8 bytes ("\xE3\x81\x82...") back into
// the string those bytes encode.
// The work is a three-stage pipeline: escaped text -> byte values ->
// Unicode code points -> JavaScript string.
function unescapeFromEscapedBytes (str) {
    return convertUnicodeCodePointsToString(
        convertUtf8BytesToUnicodeCodePoints(
            convertEscapedBytesToBytes(str)
        )
    );
}
// Extract the byte values from a string of "\xHH" escapes.
//   '\\xE3\\x81\\x82\\xE3\\x81\\x84' => [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ]
// Only the hexadecimal "\x" form is recognized. Each escape contributes the
// one or two hex digits that directly follow it, so every result is already
// in the range 0..255. Anything else in the string (text before the first
// escape, trailing text after the digits, escapes with no hex digit) is
// ignored.
function convertEscapedBytesToBytes(str) {
    var codes = [];
    // A fresh regex per call: a shared /g regex would carry lastIndex state
    // between calls.
    var escape_pattern = /\\x([0-9A-Fa-f]{1,2})/g;
    var match;
    while ((match = escape_pattern.exec(str)) !== null) {
        // Capping the capture at two digits keeps hex-looking text after
        // the escape (e.g. "\x41BC") from being absorbed into the byte.
        codes.push(parseInt(match[1], 16));
    }
    return codes;
}
// Decode an array of UTF-8 byte values into Unicode code points.
//   [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ] => [ 0x3042, 0x3044 ]
// Decoding is lenient; malformed input is dropped rather than reported:
//   - values >= 0x100 are skipped,
//   - continuation bytes (10xxxxxx) with no sequence in progress are skipped,
//   - lead bytes that start no 1..4 byte sequence (0xF8..0xFF) are skipped,
//   - a sequence cut short by a new lead byte or by the end of input is
//     discarded.
// A code point is emitted as soon as its last byte has been consumed, so a
// skipped byte can never cause an earlier code point to be emitted twice.
function convertUtf8BytesToUnicodeCodePoints(utf8_bytes) {
    var unicode_codes = [];
    var unicode_code = 0;   // Bits accumulated for the sequence in progress.
    var num_followed = 0;   // Continuation bytes still expected.
    for (var i = 0; i < utf8_bytes.length; ++i) {
        var utf8_byte = utf8_bytes[i];
        if (utf8_byte >= 0x100) {
            // Not a byte value: ignored.
            continue;
        }
        if ((utf8_byte & 0xC0) === 0x80) {
            // Continuation byte: contributes 6 payload bits.
            if (num_followed > 0) {
                unicode_code = (unicode_code << 6) | (utf8_byte & 0x3f);
                num_followed -= 1;
                if (num_followed === 0) {
                    unicode_codes.push(unicode_code);
                }
            }
            continue;
        }
        // Lead byte. A valid one abandons any unfinished sequence.
        if (utf8_byte < 0x80) { // 1-byte
            unicode_codes.push(utf8_byte);
            num_followed = 0;
        } else if ((utf8_byte & 0xE0) === 0xC0) { // 2-byte
            unicode_code = utf8_byte & 0x1f;
            num_followed = 1;
        } else if ((utf8_byte & 0xF0) === 0xE0) { // 3-byte
            unicode_code = utf8_byte & 0x0f;
            num_followed = 2;
        } else if ((utf8_byte & 0xF8) === 0xF0) { // 4-byte
            unicode_code = utf8_byte & 0x07;
            num_followed = 3;
        }
        // Otherwise (0xF8..0xFF): invalid lead byte, skipped with the
        // decoder state left untouched.
    }
    return unicode_codes;
}
// Convert Unicode code points into UTF-16 code units.
//   [ 0x3042, 0x3044 ] => [ 0x3042, 0x3044 ]
//   [ 0x2000B ]        => [ 0xD840, 0xDC0B ]  // A surrogate pair.
// Code points below 0x10000 are copied through unchanged; anything at or
// above 0x10000 is split into a high and a low surrogate.
function convertUnicodeCodePointsToUtf16Codes(unicode_codes) {
    var utf16_codes = [];
    for (var i = 0; i < unicode_codes.length; ++i) {
        var unicode_code = unicode_codes[i];
        if (unicode_code < 0x10000) {
            utf16_codes.push(unicode_code);
        } else {
            // 20 payload bits: the top 10 go to the high surrogate, the
            // bottom 10 to the low one. Shift/mask keeps both halves
            // integral (a plain division would leave a fractional part).
            var offset = unicode_code - 0x10000;
            utf16_codes.push(0xD800 + (offset >> 10));
            utf16_codes.push(0xDC00 + (offset & 0x3FF));
        }
    }
    return utf16_codes;
}
// Build a JavaScript string from Unicode code points.
//   [ 0x3042, 0x3044 ] => "あい"
// The code points are first expanded to UTF-16 code units, which are then
// assembled into the string.
function convertUnicodeCodePointsToString(unicode_codes) {
    return convertUtf16CodesToString(
        convertUnicodeCodePointsToUtf16Codes(unicode_codes)
    );
}
// Assemble a string from UTF-16 code units, one character per unit.
//   [ 0x3042, 0x3044 ] => "あい"
function convertUtf16CodesToString(utf16_codes) {
    var pieces = [];
    var count = utf16_codes.length;
    var index = 0;
    while (index < count) {
        pieces.push(String.fromCharCode(utf16_codes[index]));
        index += 1;
    }
    return pieces.join('');
}
+
return {
raw: UNICODE_BLOCK_LIST,
lookup: UNICODE_LOOKUP,
@@ -437,5 +538,6 @@ var unicode = (function(){
findGroups: findGroups,
paginate: paginate,
escapeToEscapedBytes: escapeToEscapedBytes,
+ unescapeFromEscapedBytes: unescapeFromEscapedBytes,
}
})()