importing rtf

author: Jules Laplace <julescarbon@gmail.com> 2021-08-23 21:33:42 +0200
committer: Jules Laplace <julescarbon@gmail.com> 2021-08-23 21:33:42 +0200
commit: a05d52a7b13607181ce0443b17769bb02532dfc1 (patch)
tree: f5c4722363f29a610d1a4f95efbd9351999083f6 /load_spreadsheet.js
parent: 5ddde1cbb70bf4bc2df127fced5afb966069d299 (diff)
1 files changed, 172 insertions, 21 deletions
diff --git a/load_spreadsheet.js b/load_spreadsheet.js
index 347c6eb..3bada97 100644
--- a/load_spreadsheet.js
+++ b/load_spreadsheet.js
@@ -3,30 +3,24 @@
  */
 
 import { loadJSON, loadCSV, writeJSON } from "./file_utils.js";
+import { readdir } from "fs/promises";
+import parseRTF from "rtf-parser";
+import fs from "fs";
+import sizeOf from "image-size";
 
 const datasheetFile = "./data_store/tags.csv";
 const dbFile = "./db.json";
 
-var tagTypes = [
-  "No6092",
-  "1620s",
-  "painting",
-  "blunt",
-  "National Gallery of Canada",
-  "AGO",
-  "courtauld",
-  "intervensions",
-  "connosieurship",
-  "double agent",
-  "forensics",
-  "black box",
-];
+var tagTypes = "No6092,1620s,painting,blunt,National Gallery of Canada,AGO,courtauld,intervensions,connsoeurship,double agent,forensics,black box,Stankievech".split(
+  ","
+); // each entry is looked up as a column key on every CSV row; its position in this list + 1 is the tag id written to the record, so order matters
 
 async function main() {
   // basically this script exists to assign the X'd fields from the spreadsheet
   // to the okcms json :)
   const data = await loadCSV(datasheetFile);
   const db = await loadJSON(dbFile);
+  const dataStore = await loadDataStoreIndex("./data_store");
 
   db.page = db.page || [];
   db.ui = db.ui || [];
@@ -39,30 +33,187 @@ async function main() {
   }, {});
 
   // loop over the CSV data :)
-  data.forEach((row, index) => {
-    const cell = pageById[index] || {
+  let index = -1;
+  for (let row of data) {
+    index += 1;
+    const record = pageById[index] || {
       __index: index,
-      id: "post_" + index,
+      id: "page_" + index,
       title: row.Title,
     };
     // loop over the tags...
     let tagIndex = 0;
     tagTypes.forEach((type, tagId) => {
       if (row[type] === "x") {
-        cell["tag_" + tagIndex] = tagId + 1;
+        record["tag_" + tagIndex] = tagId + 1;
         tagIndex += 1;
       }
     });
     // make sure all other tags are cleared out
     for (; tagIndex < 9; tagIndex++) {
-      cell["tag_" + tagIndex] = 0;
+      record["tag_" + tagIndex] = 0;
     }
+    // if there is a corresponding record in the data store, accumulate it
+    if (String(index + 1) in dataStore) {
+      await loadFiles(dataStore[index + 1], record);
+    }
+    // if we haven't seen this ID before, append it
     if (!pageById[index]) {
-      db.page.push(cell);
+      db.page.push(record);
     }
-  });
+  }
 
   await writeJSON(dbFile, db);
 }
 
+async function loadDataStoreIndex(path) { // Lists the directory at `path` and maps each entry's numeric-looking prefix to the entry name.
+  const files = await readdir(path);
+  let parts, index;
+  let folders = {};
+  for (const file of files) {
+    if (file.match(".csv")) continue; // NOTE(review): a string argument is converted to a RegExp, so "." matches any character and the pattern is unanchored
+    if (file.match(".DS_Store")) continue;
+    parts = file.split("-");
+    index = parts[0].trim().replace(/^0/, ""); // key is the text before the first "-", trimmed, with at most ONE leading "0" removed
+    folders[index] = file; // a later entry producing the same key overwrites the earlier one
+  }
+  return folders; // resolves to a plain object: { [key]: entryName }
+}
+
+async function loadFiles(folder, record) { // Reads ./data_store/<folder>/ and populates `record` (images, thumbnail, text, link) based on each file name.
+  const path = `./data_store/${folder}/`;
+  const files = await readdir(path);
+  const images = (record.images = []); // record.images is reset on every call; `images` aliases the new array
+  let dimensions;
+  for (const file of files) {
+    if (file.match(".DS_Store")) continue;
+    if (file.match(/-URL.rtf/i)) { // link file; loadLink assigns a new record.images, so later pushes to `images` no longer reach the record
+      await loadLink(path + file, record);
+    } else if (file.match(/.rtf/i)) { // NOTE(review): "." is unescaped and the pattern is unanchored, so this matches more than a ".rtf" extension
+      await loadText(path + file, record);
+    } else if (file.match(/.txt/i)) { // same caveat as above for ".txt"
+      console.error("+ fix text file", path + file); // text files are only reported, never imported
+    } else if (file.match(/-thumb/i)) {
+      dimensions = sizeOf(path + file); // presumably width/height/type from image-size — TODO confirm against the library docs
+      record.thumbnail = {
+        uri: `assets/data_store/${folder}/${file}`,
+        caption: "",
+        ...dimensions,
+      };
+    } else { // everything else is treated as an image
+      dimensions = sizeOf(path + file);
+      images.push({
+        uri: `assets/data_store/${folder}/${file}`,
+        caption: "",
+        ...dimensions,
+      });
+    }
+  }
+}
+
+async function loadText(path, record) { // Parses the RTF file at `path` and fills record.author, .title, .citation and .description; resolves with no value.
+  return new Promise((resolve, reject) => { // NOTE(review): reject is never called
+    parseRTF.stream(fs.createReadStream(path), (err, doc) => { // NOTE(review): err is not checked; doc.content below throws if doc is undefined
+      const paragraphs = doc.content.filter((para) => para.content); // entries that carry a truthy .content
+      const finalParagraph = doc.content.filter((para) => !para.content); // entries without .content; each is handled as a clip further down
+      record.citation = "";
+      record.description = "";
+      let groupCount = 0; // number of paragraphs seen so far whose .content array is empty
+      let content = "";
+      record.author = "";
+      record.title = ""; // overwrites whatever title the record already had
+      paragraphs.forEach((para, paragraphIndex) => {
+        const paragraph = [];
+        para.content.forEach((clip) => {
+          switch (paragraphIndex) {
+            case 0: // number
+              // console.log(clip.value);
+              return; // clips of the first paragraph are discarded
+            case 1: // author
+              // console.log(clip.value);
+              record.author += getClipValue(clip);
+              return;
+            case 2: // title
+              // console.log(clip.value);
+              record.title += getClipValue(clip);
+              return;
+            default:
+              appendClip(paragraph, clip);
+          }
+        });
+        if (paragraph.length) {
+          if (groupCount < 3) { // until three empty paragraphs have been seen, text goes to the citation
+            record.citation += paragraph.join("") + "<br>\n";
+          } else { // afterwards each paragraph becomes a <p> block of the description
+            content += "<p>\n" + paragraph.join("") + "\n</p>\n\n";
+          }
+        }
+        if (!para.content.length) {
+          groupCount += 1;
+        }
+      });
+      const finalParagraphExtract = [];
+      finalParagraph.forEach((clip) => {
+        appendClip(finalParagraphExtract, clip);
+      });
+      if (finalParagraphExtract.length) { // all content-less entries are joined into one trailing <p> block
+        content += "<p>\n" + finalParagraphExtract.join("") + "\n</p>\n\n";
+      }
+
+      record.description = content;
+      resolve();
+    });
+  });
+}
+
+function appendClip(paragraph, clip) { // Pushes the clip's (possibly tag-wrapped) text onto the `paragraph` array, mutating it in place.
+  paragraph.push(getClipValue(clip));
+}
+function getClipValue(clip) { // Returns clip.value wrapped in <i> or <u> according to clip.style; the value is inserted as-is, without HTML escaping.
+  if (clip.style.italic) { // italic is checked first, so a clip that is both italic and underlined gets only <i>
+    return "<i>" + clip.value + "</i>";
+  } else if (clip.style.underline) {
+    return "<u>" + clip.value + "</u>";
+  } else {
+    return clip.value;
+  }
+}
+
+async function loadLink(path, record) { // Parses the RTF file at `path`, extracts a URL from it, and turns `record` into a single-video entry; resolves with no value.
+  return new Promise((resolve, reject) => { // NOTE(review): reject is never called
+    parseRTF.stream(fs.createReadStream(path), (err, doc) => { // NOTE(review): err is not checked; doc.content below throws if doc is undefined
+      const paragraphs = doc.content;
+      let uri; // ends up as the LAST value starting with "http", since every match overwrites it
+      paragraphs.forEach((para, paragraphIndex) => {
+        const paragraph = []; // NOTE(review): never used
+        para.content?.forEach((clip) => {
+          if (clip.value.match(/^http/)) {
+            uri = clip.value.trim();
+          }
+        });
+        if (para.value?.match(/^http/)) { // entries without .content may carry the text directly on .value
+          uri = para.value.trim();
+        }
+      });
+      let match = uri.match(/\d+/); // NOTE(review): throws a TypeError when no "http" value was found (uri is still undefined)
+      let token = ""; // first run of digits in the URI, or "" when there is none
+      if (match) {
+        token = match[0];
+      } else {
+        console.error("No token:", uri);
+      }
+      record.type = "video";
+      record.images = [ // replaces any images already on the record with this one video entry
+        {
+          type: "video",
+          caption: "",
+          uri,
+          token,
+        },
+      ];
+      resolve();
+    });
+  });
+}
+
 main().then(() => process.exit(0));
author	Jules Laplace <julescarbon@gmail.com>	2021-08-23 21:33:42 +0200
committer	Jules Laplace <julescarbon@gmail.com>	2021-08-23 21:33:42 +0200
commit	a05d52a7b13607181ce0443b17769bb02532dfc1 (patch)
tree	f5c4722363f29a610d1a4f95efbd9351999083f6 /load_spreadsheet.js
parent	5ddde1cbb70bf4bc2df127fced5afb966069d299 (diff)