summary refs log tree commit diff
path: root/src/app/utils/unicode_utils.js
diff options
context:
space:
mode:
Diffstat (limited to 'src/app/utils/unicode_utils.js')
-rw-r--r-- src/app/utils/unicode_utils.js 63
1 files changed, 63 insertions, 0 deletions
diff --git a/src/app/utils/unicode_utils.js b/src/app/utils/unicode_utils.js
new file mode 100644
index 0000000..c6a2253
--- /dev/null
+++ b/src/app/utils/unicode_utils.js
@@ -0,0 +1,63 @@
+/**
+ * Functions for dealing with Arabic text.
+ * Based on https://github.com/ahmads/arabicString/
+ * For Farsi, it is sufficient to use the `persianRex` NPM module
+ * @module utils/unicode_utils
+ */
+
+import makeEmojiRegexp from "emoji-regex/RGI_Emoji.js";
+
+// Punctuation alternatives, written as "|"-separated regular-expression
+// fragments so they can be dropped straight into a group.
+const rtlPunctuation = "،|؟|«|»|؛|٬";
+const ltrPunctuation = "\\.|:|!|-|\\[|\\]|\\(|\\)|\\\\|/";
+
+// Matches any single punctuation mark from either list above.
+const punctuationRegexp = new RegExp(
+  `(${[rtlPunctuation, ltrPunctuation].join("|")})`,
+  "gu"
+);
+
+// U+0621-U+064A is the range treated as letters, U+064B-U+0652 the range
+// treated as diacritics (tashkil); `arabicRegexp` spans both.
+const arabicLettersRegexp = new RegExp("[\u0621-\u064A]", "gu");
+const arabicDiacriticsRegexp = new RegExp("[\u064B-\u0652]", "gu");
+const arabicRegexp = new RegExp("[\u0621-\u0652]", "gu");
+
+// Emoji matcher built by the `emoji-regex` package.
+const emojiRegexp = makeEmojiRegexp();
+
+/**
+ * The share of Arabic characters in the `String`.
+ *
+ * Punctuation (everything matched by `punctuationRegexp`) and emoji are
+ * removed before measuring. Every other character, including digits and
+ * whitespace, still counts toward the total.
+ * @param {String} text Text to process
+ * @returns {Number} ratio from `0.0` to `1.0`; `0.0` for empty input or when
+ *   nothing is left after stripping
+ */
+
+export const howArabic = (text) => {
+  if (!text) return 0.0;
+  // strip punctuation and emoji so they do not skew the ratio
+  const stripped = text
+    .replace(punctuationRegexp, "")
+    .replace(emojiRegexp, "");
+  // nothing left to measure: avoid 0 / 0, which would yield NaN
+  if (stripped.length === 0) return 0.0;
+  const matches = stripped.match(arabicRegexp) || [];
+  return matches.length / stripped.length;
+};
+
+/**
+ * Is the `String` Arabic, based on
+ * a given `threshold` between `0` and `1`. Defaults to `0.79`.
+ *
+ * The text counts as Arabic when `howArabic(text)` is greater than or
+ * equal to the threshold.
+ * @param {String} text Text to process
+ * @param {Number} [threshold=0.79] Minimum ratio; an explicit `0` is honoured
+ * @returns {Boolean}
+ */
+
+export const isArabic = (text, threshold = 0.79) => {
+  // An explicit 0 is a valid threshold; any other falsy value
+  // (null, NaN, "") still falls back to the default.
+  const limit = threshold === 0 ? 0 : threshold || 0.79;
+  return howArabic(text) >= limit;
+};
+
+/**
+ * Does the `String` have _any_ Arabic letters.
+ * @param {String} text Text to process
+ * @returns {Boolean}
+ */
+
+export const hasArabic = (text) => {
+  // `arabicLettersRegexp` carries the `g` flag, so `.test()` resumes from
+  // `lastIndex`. Rewind it first so a previous call cannot cause a false
+  // negative on this one.
+  arabicLettersRegexp.lastIndex = 0;
+  return arabicLettersRegexp.test(text);
+};
+
+/**
+ * Strip the Arabic tashkil (diacritics) out of the `String`.
+ * @param {String} text Text to process
+ * @returns {String} `text` with every match of `arabicDiacriticsRegexp` removed
+ */
+
+export const removeTashkel = (text) => {
+  const withoutTashkil = text.replace(arabicDiacriticsRegexp, "");
+  return withoutTashkil;
+};