diff options
Diffstat (limited to 'js/unicode.js')
| -rw-r--r-- | js/unicode.js | 102 |
1 files changed, 102 insertions, 0 deletions
// From the diff of js/unicode.js (hunk @@ -428,6 +428,107 @@): the functions
// below are added inside the `unicode` module closure, and the same diff adds
// `unescapeFromEscapedBytes: unescapeFromEscapedBytes` to the object that the
// closure returns.

// Decode a string of escaped UTF-8 bytes into text.
//   "\xE3\x81\x82\xE3\x81\x84" (literal backslashes) => "\u3042\u3044"
// str: text containing \xHH and/or \OOO byte escapes; any other text is skipped.
// Returns the decoded string ('' when no well-formed sequence is found).
function unescapeFromEscapedBytes(str) {
  var data_bytes = convertEscapedBytesToBytes(str);
  var unicode_codes = convertUtf8BytesToUnicodeCodePoints(data_bytes);
  return convertUnicodeCodePointsToString(unicode_codes);
}

// Extract byte values from hexadecimal (\xHH) and octal (\OOO) escapes.
//   r'\xE3\x81\x82\xE3\x81\x84' => [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ]
//   r'\343\201\202\343\201\204' => [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ]
// str: the escaped text.
// Returns an array of integers in the range 0..255.
function convertEscapedBytesToBytes(str) {
  // At most two hex digits (or three octal digits) belong to one escape, so
  // text that directly follows an escape cannot change the byte's value.
  var escape_pattern = /\\(?:x([0-9A-Fa-f]{1,2})|([0-7]{1,3}))/g;
  var max = 0x100;
  var codes = [];
  var match;
  while ((match = escape_pattern.exec(str)) !== null) {
    var code = match[1] !== undefined
      ? parseInt(match[1], 16)
      : parseInt(match[2], 8);
    if (code < max) {
      codes.push(code);
    }
    // Otherwise (octal escape above \377): malformed code ignored.
  }
  return codes;
}

// Decode UTF-8 bytes into Unicode code points.
//   [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ] => [ 0x3042, 0x3044 ]
// utf8_bytes: array of byte values.
// Returns an array of code points. Malformed input is skipped rather than
// reported: out-of-range entries, stray continuation bytes, invalid lead bytes
// and truncated sequences produce no output. Overlong encodings and values
// above U+10FFFF are not rejected.
function convertUtf8BytesToUnicodeCodePoints(utf8_bytes) {
  var unicode_codes = [];
  var unicode_code = 0;
  var num_followed = 0; // Continuation bytes still expected.
  for (var i = 0; i < utf8_bytes.length; ++i) {
    var utf8_byte = utf8_bytes[i];
    if (!(utf8_byte >= 0 && utf8_byte < 0x100)) {
      continue; // Not a byte value (also catches NaN/undefined): ignored.
    }
    if ((utf8_byte & 0xC0) === 0x80) { // Continuation byte.
      if (num_followed > 0) {
        unicode_code = (unicode_code << 6) | (utf8_byte & 0x3F);
        num_followed -= 1;
        if (num_followed === 0) {
          // Emit as soon as the sequence completes, so nothing can be
          // emitted twice by a later malformed byte.
          unicode_codes.push(unicode_code);
        }
      }
      // A continuation byte outside a sequence is malformed: ignored.
    } else if (utf8_byte < 0x80) { // 1-byte
      // Any unfinished multi-byte sequence before this byte is dropped.
      num_followed = 0;
      unicode_codes.push(utf8_byte);
    } else if ((utf8_byte & 0xE0) === 0xC0) { // 2-byte
      unicode_code = utf8_byte & 0x1F;
      num_followed = 1;
    } else if ((utf8_byte & 0xF0) === 0xE0) { // 3-byte
      unicode_code = utf8_byte & 0x0F;
      num_followed = 2;
    } else if ((utf8_byte & 0xF8) === 0xF0) { // 4-byte
      unicode_code = utf8_byte & 0x07;
      num_followed = 3;
    }
    // Otherwise (0xF8..0xFF): invalid lead byte, ignored with the decoder
    // state left untouched.
  }
  // A sequence still incomplete at the end of input is dropped.
  return unicode_codes;
}

// Encode Unicode code points as UTF-16 code units.
//   [ 0x3042, 0x3044 ] => [ 0x3042, 0x3044 ]
//   [ 0x2000B ] => [ 0xD840, 0xDC0B ] // A surrogate pair.
// unicode_codes: array of code points.
// Returns an array of integer UTF-16 code units.
function convertUnicodeCodePointsToUtf16Codes(unicode_codes) {
  var utf16_codes = [];
  for (var i = 0; i < unicode_codes.length; ++i) {
    var unicode_code = unicode_codes[i];
    if (unicode_code < 0x10000) {
      utf16_codes.push(unicode_code);
    } else {
      // Integer shift/mask keeps both halves whole numbers; a plain division
      // would leave a fractional high surrogate.
      var offset = unicode_code - 0x10000;
      utf16_codes.push((offset >> 10) + 0xD800);
      utf16_codes.push((offset & 0x3FF) + 0xDC00);
    }
  }
  return utf16_codes;
}

// Build a string from Unicode code points.
//   [ 0x3042, 0x3044 ] => "\u3042\u3044"
function convertUnicodeCodePointsToString(unicode_codes) {
  var utf16_codes = convertUnicodeCodePointsToUtf16Codes(unicode_codes);
  return convertUtf16CodesToString(utf16_codes);
}

// Build a string from UTF-16 code units.
//   [ 0x3042, 0x3044 ] => "\u3042\u3044"
function convertUtf16CodesToString(utf16_codes) {
  var unescaped = '';
  for (var i = 0; i < utf16_codes.length; ++i) {
    unescaped += String.fromCharCode(utf16_codes[i]);
  }
  return unescaped;
}
