summaryrefslogtreecommitdiff
path: root/js/unicode.js
diff options
context:
space:
mode:
author Jules Laplace <jules@okfoc.us> 2015-07-16 18:05:47 -0400
committer Jules Laplace <jules@okfoc.us> 2015-07-16 18:05:47 -0400
commit 1863819e9862bf22a2b9a990b7062cb0ade44b0f (patch)
tree 62747731346db057ea74610cd3ae499809a5b372 /js/unicode.js
parent e7c81c0763d4c4261dc5c0b2502a551b4db69bb0 (diff)
escape unicode as escaped bytes (\x)
Diffstat (limited to 'js/unicode.js')
-rw-r--r-- js/unicode.js 124
1 files changed, 124 insertions, 0 deletions
diff --git a/js/unicode.js b/js/unicode.js
index cbab781..700434d 100644
--- a/js/unicode.js
+++ b/js/unicode.js
@@ -305,6 +305,129 @@ var unicode = (function(){
}
return groups
}
+
// Escapes every non-ASCII UTF-16 code unit of `txt` as a \uXXXX sequence
// (backslash, "u", four lowercase hex digits). ASCII characters (<= 0x7f)
// are copied through unchanged.
// Characters outside the BMP come out as two escapes, one per surrogate.
//   "a" + U+00E9  => 'a\u00e9'
//   U+3042 U+3044 => '\u3042\u3044'
function escapeToUtf16 (txt) {
  var escaped_txt = "", kode, hex
  for (var i = 0; i < txt.length; i++) {
    kode = txt.charCodeAt(i)
    if (kode > 0x7f) {
      hex = kode.toString(16)
      // charCodeAt never returns more than 0xffff, so left-padding to four
      // digits always produces a fixed-width escape
      while (hex.length < 4) {
        hex = "0" + hex
      }
      escaped_txt += "\\u" + hex
    }
    else {
      escaped_txt += txt[i]
    }
  }
  return escaped_txt
}
+
// Escapes every non-ASCII character of `txt` as its UTF-8 bytes, each byte
// written as \xHH (uppercase hex). ASCII characters (<= 0x7f) are copied
// through unchanged.
//   U+3042     => '\xE3\x81\x82'
//   U+1F600    => '\xF0\x9F\x98\x80'
function escapeToEscapedBytes (txt) {
  var escaped_txt = "", kode, low, utf8_bytes
  for (var i = 0; i < txt.length; i++) {
    kode = txt.charCodeAt(i)
    // A character outside the BMP is stored as a high + low surrogate pair.
    // Merge the pair into one code point first; encoding each half on its
    // own would yield two 3-byte sequences, which is not valid UTF-8.
    if (kode >= 0xD800 && kode <= 0xDBFF && i + 1 < txt.length) {
      low = txt.charCodeAt(i + 1)
      if (low >= 0xDC00 && low <= 0xDFFF) {
        kode = ((kode - 0xD800) << 10) + (low - 0xDC00) + 0x10000
        i++ // the low surrogate has been consumed as part of this character
      }
    }
    if (kode > 0x7f) {
      // An unpaired surrogate still reaches this branch and is encoded as a
      // lone 3-byte sequence.
      utf8_bytes = convertUnicodeCodePointToUtf8Bytes(kode)
      escaped_txt += convertBytesToEscapedString(utf8_bytes, 16)
    }
    else {
      escaped_txt += txt[i]
    }
  }
  return escaped_txt
}
+
// Escapes an ENTIRE string (ASCII included) as its UTF-8 bytes.
// `base` selects the notation: 8 gives octal escapes (\343), anything else,
// including an omitted argument, gives hex escapes (\xE3).
function escapeAllToEscapedBytes(str, base) {
  // Only the two notations the byte formatter knows about are accepted;
  // any other value falls back to hex.
  var radix = (base == 8) ? 8 : 16;
  var unicode_codes = convertStringToUnicodeCodePoints(str);
  var data_bytes = convertUnicodeCodePointsToBytes(unicode_codes);
  return convertBytesToEscapedString(data_bytes, radix);
}
// Renders an array of byte values as one string of escape sequences.
// With base 16 each byte becomes "\x" plus two digits; with any other base
// it becomes "\" plus three digits in that base (intended for octal).
// [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ] => '\xE3\x81\x82\xE3\x81\x84'
// [ 0343, 0201, 0202, 0343, 0201, 0204 ] => '\343\201\202\343\201\204'
function convertBytesToEscapedString(data_bytes, base) {
  var is_hex = (base == 16);
  var lead = is_hex ? "\\x" : "\\";
  var width = is_hex ? 2 : 3;
  var pieces = [];
  for (var idx = 0; idx < data_bytes.length; idx++) {
    pieces.push(lead + formatNumber(data_bytes[idx], base, width));
  }
  return pieces.join('');
}
// Encodes an array of code points as one flat array of UTF-8 byte values.
// [ 0x3042, 0x3044 ] => [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ]
function convertUnicodeCodePointsToBytes(unicode_codes) {
  var utf8_bytes = [];
  for (var i = 0; i < unicode_codes.length; ++i) {
    var bytes = convertUnicodeCodePointToUtf8Bytes(unicode_codes[i]);
    // Append in place: concat() would copy the whole accumulated array on
    // every iteration. Each `bytes` holds at most four entries.
    Array.prototype.push.apply(utf8_bytes, bytes);
  }
  return utf8_bytes;
}
// Encodes a single code point as an array of 1 to 4 UTF-8 byte values.
// Values of 0x200000 and above match no branch and yield an empty array.
// 0x3042 => [ 0xE3, 0x81, 0x82 ]
function convertUnicodeCodePointToUtf8Bytes(unicode_code) {
  var cp = unicode_code;
  if (cp < 0x80) {
    // 1 byte: the value itself
    return [cp];
  }
  if (cp < 0x800) {
    // 2 bytes: 110xxxxx 10xxxxxx
    return [(cp >>> 6) | 0xC0, (cp & 0x3F) | 0x80];
  }
  if (cp < 0x10000) {
    // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
    return [
      (cp >>> 12) | 0xE0,
      ((cp >> 6) & 0x3F) | 0x80,
      (cp & 0x3F) | 0x80
    ];
  }
  if (cp < 0x200000) {
    // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    return [
      (cp >>> 18) | 0xF0,
      ((cp >> 12) & 0x3F) | 0x80,
      ((cp >> 6) & 0x3F) | 0x80,
      (cp & 0x3F) | 0x80
    ];
  }
  return [];
}
// Decodes a JavaScript (UTF-16) string into an array of code points.
// A valid high + low surrogate pair is merged into one code point.
// A high surrogate that is not followed by a low surrogate is dropped, but
// the unit after it is still processed normally. A low surrogate with no
// preceding high surrogate is passed through as-is.
// U+3042 U+3044 => [ 0x3042, 0x3044 ]
function convertStringToUnicodeCodePoints(str) {
  var surrogate_1st = 0; // pending high surrogate, 0 when there is none
  var unicode_codes = [];
  for (var i = 0; i < str.length; ++i) {
    var utf16_code = str.charCodeAt(i);
    if (surrogate_1st != 0 && utf16_code >= 0xDC00 && utf16_code <= 0xDFFF) {
      // Pending high surrogate + this low surrogate form one code point.
      unicode_codes.push((surrogate_1st - 0xD800) * (1 << 10) + (1 << 16) +
        (utf16_code - 0xDC00));
      surrogate_1st = 0;
      continue;
    }
    // Any high surrogate still pending here has no partner: discard it,
    // then handle the current unit on its own so it is not lost with it.
    surrogate_1st = 0;
    if (utf16_code >= 0xD800 && utf16_code <= 0xDBFF) {
      surrogate_1st = utf16_code;
    } else {
      unicode_codes.push(utf16_code);
    }
  }
  return unicode_codes;
}
// Formats `number` in the given base as uppercase digits, left-padded with
// zeros to at least `num_digits` characters. Longer values are not truncated.
// (0xff, 16, 2) => "FF"
// (0xff, 8, 3)  => "377"
function formatNumber(number, base, num_digits) {
  var digits = number.toString(base).toUpperCase();
  var padding_needed = num_digits - digits.length;
  while (padding_needed > 0) {
    digits = "0" + digits;
    padding_needed -= 1;
  }
  return digits;
}
+
return {
raw: UNICODE_BLOCK_LIST,
lookup: UNICODE_LOOKUP,
@@ -313,5 +436,6 @@ var unicode = (function(){
block: block,
findGroups: findGroups,
paginate: paginate,
+ escapeToEscapedBytes: escapeToEscapedBytes,
}
})()