1 files changed, 102 insertions, 0 deletions
diff --git a/js/unicode.js b/js/unicode.js
index 700434d..2199750 100644
--- a/js/unicode.js
+++ b/js/unicode.js
@@ -428,6 +428,107 @@ var unicode = (function(){
     return str;
   }
 
+  // Turn a string of "\xHH" escaped UTF-8 bytes (e.g. "\xE3\x81\x82") into text.
+  function unescapeFromEscapedBytes (str) {
+    // Pipeline: escaped text -> byte values -> code points -> string.
+    var code_points = convertUtf8BytesToUnicodeCodePoints(convertEscapedBytesToBytes(str));
+    return convertUnicodeCodePointsToString(code_points);
+  }
+  // Collect the byte value of every "\xHH" escape in str; any other text is skipped.
+  // r'\xE3\x81\x82\xE3\x81\x84' => [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ]
+  // r'\xE3\x81\x82abc'          => [ 0xE3, 0x81, 0x82 ]
+  // Octal escapes such as r'\343\201\202' are not recognised and yield no bytes.
+  function convertEscapedBytesToBytes(str) {
+    // At most two hex digits belong to one escape. Handing the whole tail to
+    // parseInt lets literal text that starts with hex digits ("\x82abc")
+    // push the value past 0xFF, which silently loses the byte.
+    // A lone digit ("\x4") is still accepted.
+    var escape_pattern = /\\x([0-9a-fA-F]{1,2})/g;
+    var codes = [];
+    var match;
+    while ((match = escape_pattern.exec(str)) !== null) {
+      codes.push(parseInt(match[1], 16));
+    }
+    return codes;
+  }
+  // Decode an array of UTF-8 byte values into Unicode code points.
+  // [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ] => [ 0x3042, 0x3044 ]
+  // [ 0x41, 0xFF, 0x42 ]                   => [ 0x41, 0x42 ]
+  // Malformed input is dropped, never reported:
+  //   - values that are not integers in 0x00..0xFF, and the bytes 0xF8..0xFF
+  //     (never valid in UTF-8), are skipped without touching decoder state;
+  //   - a continuation byte with no sequence in progress is skipped;
+  //   - a sequence cut short by a lead byte or by end of input is discarded;
+  //   - a 4-byte sequence that decodes above 0x10FFFF is discarded.
+  // Overlong forms and encoded surrogates are NOT rejected.
+  function convertUtf8BytesToUnicodeCodePoints(utf8_bytes) {
+    var unicode_codes = [];
+    var unicode_code = 0;  // Code point being assembled.
+    var num_followed = 0;  // Continuation bytes still expected.
+    for (var i = 0; i < utf8_bytes.length; ++i) {
+      var utf8_byte = utf8_bytes[i];
+      if ((utf8_byte & 0xFF) !== utf8_byte || utf8_byte >= 0xF8) {
+        continue;  // Not a usable UTF-8 byte.
+      }
+      if ((utf8_byte & 0xC0) === 0x80) {  // continuation
+        if (num_followed === 0) {
+          continue;  // Stray continuation byte.
+        }
+        unicode_code = (unicode_code << 6) | (utf8_byte & 0x3f);
+        num_followed -= 1;
+        // Emit the moment a sequence completes. Deferring the push until the
+        // next lead byte needs a placeholder element and re-emits the previous
+        // value whenever an invalid lead byte shows up in between.
+        if (num_followed === 0 && unicode_code <= 0x10FFFF) {
+          unicode_codes.push(unicode_code);
+        }
+      } else if (utf8_byte < 0x80) {  // 1-byte
+        unicode_codes.push(utf8_byte);
+        num_followed = 0;  // Abandons any unfinished sequence.
+      } else if ((utf8_byte & 0xE0) === 0xC0) {  // 2-byte
+        unicode_code = utf8_byte & 0x1f;
+        num_followed = 1;
+      } else if ((utf8_byte & 0xF0) === 0xE0) {  // 3-byte
+        unicode_code = utf8_byte & 0x0f;
+        num_followed = 2;
+      } else {  // 4-byte lead, 0xF0..0xF7
+        unicode_code = utf8_byte & 0x07;
+        num_followed = 3;
+      }
+    }
+    return unicode_codes;
+  }
+  // Encode Unicode code points as UTF-16 code units.
+  // [ 0x3042, 0x3044 ] => [ 0x3042, 0x3044 ]
+  // [ 0x2000B ] => [ 0xD840, 0xDC0B ]  // A surrogate pair.
+  function convertUnicodeCodePointsToUtf16Codes(unicode_codes) {
+    var utf16_codes = [];
+    for (var i = 0; i < unicode_codes.length; ++i) {
+      var unicode_code = unicode_codes[i];
+      if (unicode_code < 0x10000) {
+        utf16_codes.push(unicode_code);
+      } else {
+        // Shift rather than divide: "/" leaves a fractional high surrogate.
+        var offset = unicode_code - 0x10000;
+        utf16_codes.push(0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF));
+      }
+    }
+    return utf16_codes;
+  }
+  // Build a JavaScript string from Unicode code points: [ 0x3042, 0x3044 ] => "あい"
+  function convertUnicodeCodePointsToString(unicode_codes) {
+    // Code points -> UTF-16 code units -> string.
+    return convertUtf16CodesToString(convertUnicodeCodePointsToUtf16Codes(unicode_codes));
+  }
+  // Join UTF-16 code units into a string, one unit each: [ 0x3042, 0x3044 ] => "あい"
+  function convertUtf16CodesToString(utf16_codes) {
+    var pieces = [];
+    for (var idx = 0; idx < utf16_codes.length; idx += 1) {
+      pieces.push(String.fromCharCode(utf16_codes[idx]));
+    }
+    return pieces.join('');
+  }
+
   return {
     raw: UNICODE_BLOCK_LIST,
     lookup: UNICODE_LOOKUP,
@@ -437,5 +538,6 @@ var unicode = (function(){
     findGroups: findGroups,
     paginate: paginate,
     escapeToEscapedBytes: escapeToEscapedBytes,
+    unescapeFromEscapedBytes: unescapeFromEscapedBytes,
   }
 })()