Merge pull request #10 from evolvedbinary/feature/visible-by
Feature: introduce visibleBy attribute
adamretter authored Dec 21, 2023
2 parents cce6950 + 4993e19 commit 193a0b3
Showing 12 changed files with 386 additions and 138 deletions.
12 changes: 11 additions & 1 deletion README.md
@@ -71,7 +71,7 @@ For incremental indexing to work, you need to have two sets of unique ids, one f
<namespace>http://www.w3.org/XML/1998/namespace</namespace>
</namespaceMapping>
</namespaceMappings>
<index name="my-algolia-index-1" documentId="/path/to/unique-id/@xml:id">
<index name="my-algolia-index-1" documentId="/path/to/unique-id/@xml:id" visibleBy="/path/to/unique-id">
<rootObject path="/path/to/element" nodeId="@xml:id">
<attribute name="f1" path="/further/patha"/>
<attribute name="f2" path="/further/pathb" type="integer"/>
@@ -87,6 +87,8 @@ For incremental indexing to work, you need to have two sets of unique ids, one f
```


An optional `visibleBy` attribute can be used to restrict data access when searching the Algolia index.

A `rootObject` is equivalent to an object inside an Algolia Index. We create one "rootObject" either for each document, or document fragment (if you specify a path attribute on the rootObject).

An `attribute` (represents a JSON object attribute, not to be confused with an XML attribute) is a simple key/value pair that is extracted from the XML and placed into the Algolia object ("rootObject" as we call it). All of the text nodes or attribute values indicated by the "path" on the "attribute" element will be serialized to a string (and then converted if you set an explicit "type" attribute).
@@ -99,6 +101,14 @@ An `object` represents a JSON object, and this is where things become fun, we ba

The `name` attribute that is available on the "attribute" and "object" elements allows you to set the name of the field in the JSON object of the Algolia index; this means that the names of your data fields can be different in Algolia and eXist if you wish.

## Limiting object access to certain users
You can limit data access by setting the `visibleBy` attribute in `collection.xconf` and then matching that path in your XML data, preferably in the header.
You can use this example from our test suite:

XML: https://github.com/BCDH/exist-algolia-index/tree/master/src/test/resources/integration/user-specified-visibleBy/VSK.TEST.xml

collection.xconf: https://github.com/BCDH/exist-algolia-index/tree/master/src/test/resources/integration/user-specified-visibleBy/collection.xconf
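
For orientation, here is a minimal sketch of how the two sides fit together; the paths and values below are invented for illustration and are not taken from the test files above:

```xml
<!-- collection.xconf (sketch): visibleBy points at a node in each document -->
<index name="my-algolia-index-1"
       documentId="/article/header/id/@xml:id"
       visibleBy="/article/header/visibility">
    <rootObject path="/article/body/entry" nodeId="@xml:id">
        <attribute name="headword" path="form"/>
    </rootObject>
</index>
```

```xml
<!-- matching document (sketch): the string found at the visibleBy path is
     copied into every object indexed from this document -->
<article>
    <header>
        <id xml:id="doc-1"/>
        <visibility>editors</visibility>
    </header>
    <body>
        <entry xml:id="e-1"><form>test</form></entry>
    </body>
</article>
```

At query time the stored value can then be used with Algolia's standard `filters` parameter (for example `filters=visible_by:editors`, typically enforced through a secured API key) so that each user only receives the records they are allowed to see.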

<a name="logging"/>

## Enable logging in eXist (optional)
5 changes: 5 additions & 0 deletions src/main/resources/xsd/exist-algolia-index-config.xsd
@@ -123,6 +123,11 @@
<xs:documentation>Indicates an element or attribute to use the value of as a unique id for the document; if omitted, the document's id is used</xs:documentation>
</xs:annotation>
</xs:attribute>
<xs:attribute name="visibleBy" use="optional" type="c:absoluteElementPathOrAbsoluteAttributePathType">
<xs:annotation>
<xs:documentation>Sets the rule for who can request the records; if omitted, the default value is public</xs:documentation>
</xs:annotation>
</xs:attribute>
</xs:complexType>
</xs:element>

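Since the attribute is declared with `c:absoluteElementPathOrAbsoluteAttributePathType`, a configuration may point `visibleBy` at either an element or an attribute. A hypothetical sketch of each form (the other required content of the `index` element is omitted for brevity):

```xml
<!-- element path: the text content of <visibility> supplies the visible-by value -->
<index name="idx-a" documentId="/doc/@xml:id" visibleBy="/doc/header/visibility">...</index>

<!-- attribute path: the attribute's value supplies it -->
<index name="idx-b" documentId="/doc/@xml:id" visibleBy="/doc/header/@visibility">...</index>
```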
@@ -17,7 +17,6 @@

package org.humanistika.exist.index.algolia

import java.io.StringWriter
import java.util.{ArrayDeque, Deque, HashMap => JHashMap, Map => JMap, Properties => JProperties}
import javax.xml.namespace.QName

@@ -31,15 +30,13 @@ import org.exist_db.collection_config._1.{Algolia, LiteralType, Properties, Root
import org.exist_db.collection_config._1.LiteralType._
import Serializer._
import akka.actor.ActorRef
import com.fasterxml.jackson.core.{JsonFactory, JsonGenerator}
import grizzled.slf4j.Logger
import org.exist.indexing.StreamListener.ReindexMode
import org.exist.numbering.DLN
import org.humanistika.exist.index.algolia.NodePathWithPredicates.{AtomicEqualsComparison, AtomicNotEqualsComparison, ComponentType, SequenceEqualsComparison}
import org.humanistika.exist.index.algolia.backend.IncrementalIndexingManagerActor.{Add, FinishDocument, RemoveForDocument, StartDocument}
import org.w3c.dom._
import JsonUtil.writeValueField
import org.exist.util.serializer.SAXSerializer


import cats.syntax.either._

@@ -158,7 +155,7 @@ object AlgoliaStreamListener {
.getOrElse(new NodePath())
}

case class UserSpecifiedDocumentPathId(path: NodePath, value: Option[UserSpecifiedDocumentId])
case class UserSpecifiedOption(path: NodePath, value: Option[String])

case class PartialRootObject(indexName: IndexName, config: RootObject, indexable: IndexableRootObject) {
def identityEquals(other: PartialRootObject) : Boolean = {
@@ -202,7 +199,8 @@ class AlgoliaStreamListener(indexWorker: AlgoliaIndexWorker, broker: DBBroker, i

private var replacingDocument: Boolean = false
private var processing: Map[NodePath, Seq[PartialRootObject]] = Map.empty
private var userSpecifiedDocumentIds: Map[IndexName, UserSpecifiedDocumentPathId] = Map.empty
private var userSpecifiedDocumentIds: Map[IndexName, UserSpecifiedOption] = Map.empty
private var userSpecifiedVisibleByIds: Map[IndexName, UserSpecifiedOption] = Map.empty
private var userSpecifiedNodeIds: Map[(IndexName, NodePath), Option[UserSpecifiedNodeId]] = Map.empty

case class ContextElement(name: QName, attributes: Map[QName, String])
@@ -224,8 +222,13 @@
override def startIndexDocument(transaction: Txn) {
// find any User Specified Document IDs that we need to complete
this.userSpecifiedDocumentIds = indexConfigs
.map{ case (indexName, index) => indexName -> Option(index.getDocumentId).map(path => UserSpecifiedDocumentPathId(nodePath(ns, path), None)) }
.collect{ case (indexName, Some(usdid)) => indexName -> usdid }
      .map{ case (indexName, index) => indexName -> Option(index.getDocumentId).map(path => UserSpecifiedOption(nodePath(ns, path), None)) }
      .collect{ case (indexName, Some(usdid)) => indexName -> usdid }

    // find any user-specified visibleBy values that we need to complete
    this.userSpecifiedVisibleByIds = indexConfigs
      .map{ case (indexName, index) => indexName -> Option(index.getVisibleBy).map(path => UserSpecifiedOption(nodePath(ns, path), None)) }
      .collect{ case (indexName, Some(usvb)) => indexName -> usvb }

getWorker.getMode() match {
case ReindexMode.STORE =>
@@ -243,9 +246,6 @@
// update the current context
context.push(ContextElement(element.getQName.toJavaQName, Map.empty))

// update any userSpecifiedDocumentIds which we haven't yet completed and that match this element path
updateUserSpecifiedDocumentIds(pathClone, element.asLeft)

getWorker.getMode() match {
case ReindexMode.STORE =>
startElementForStore(transaction, element, pathClone)
@@ -282,6 +282,13 @@
super.attribute(transaction, attrib, pathClone)
}


  override def characters(transaction: Txn, text: AbstractCharacterData, path: NodePath): Unit = {
    val pathClone = path.duplicate
    // update any userSpecifiedVisibleByIds which we haven't yet completed and that match this text node's path
    updateUserSpecifiedVisibleIds(pathClone, text)
  }

override def endElement(transaction: Txn, element: ElementImpl, path: NodePath) {
getWorker.getMode() match {
case ReindexMode.STORE =>
@@ -313,6 +320,7 @@

// clear any User Specified Document IDs
this.userSpecifiedDocumentIds = Map.empty
this.userSpecifiedVisibleByIds = Map.empty

this.context.clear()

@@ -344,6 +352,21 @@
}
}

  private def updateUserSpecifiedVisibleIds(path: NodePath, node: Node): Unit = {
    for ((indexName, usvb) <- userSpecifiedVisibleByIds if usvb.value.isEmpty && usvb.path.equals(path)) { //TODO(AR) do we need to compare the index name?
      getStringFromNode(node) match {
        case Right(idValue) if (!idValue.isEmpty) =>
          this.userSpecifiedVisibleByIds = userSpecifiedVisibleByIds + (indexName -> usvb.copy(value = Some(idValue)))

        case Right(idValue) if (idValue.isEmpty) =>
          logger.error(s"UserSpecifiedVisibleByIds: Unable to use empty string for node path=${path}")

        case Left(ts) =>
          logger.error(s"UserSpecifiedVisibleByIds: Unable to serialize node for path=${path}")
      }
    }
  }

private def updateUserSpecifiedNodeIds(path: NodePath, attrib: AttrImpl): Unit = {
for (((indexName, nodeIdPath), usnid) <- userSpecifiedNodeIds if usnid.isEmpty && nodeIdPath.equals(path)) { //TODO(AR) do we need to compare the index name?
getString(attrib.asRight) match {
@@ -365,7 +388,7 @@ class AlgoliaStreamListener(indexWorker: AlgoliaIndexWorker, broker: DBBroker, i
private def removeForDocument() = {
val docId = getWorker.getDocument.getDocId
for(indexName <- indexConfigs.keys) {
incrementalIndexingActor ! RemoveForDocument(indexName, docId, userSpecifiedDocumentIds.get(indexName).flatMap(_.value))
incrementalIndexingActor ! RemoveForDocument(indexName, docId, userSpecifiedDocumentIds.get(indexName).flatMap(_.value), userSpecifiedVisibleByIds.get(indexName).flatMap(_.value))
}
}

@@ -379,7 +402,7 @@ class AlgoliaStreamListener(indexWorker: AlgoliaIndexWorker, broker: DBBroker, i
if (documentRootObjects.nonEmpty) {
// as we are just starting a document,
// we aren't processing these yet, so let's record them
val processingAtPath = documentRootObjects.map(rootObjectConfig => PartialRootObject(rootObjectConfig._1, rootObjectConfig._2, IndexableRootObject(indexWorker.getDocument.getCollection.getURI.getCollectionPath, indexWorker.getDocument().getCollection.getId, indexWorker.getDocument().getDocId, None, None, None, Seq.empty)))
val processingAtPath = documentRootObjects.map(rootObjectConfig => PartialRootObject(rootObjectConfig._1, rootObjectConfig._2, IndexableRootObject(indexWorker.getDocument.getCollection.getURI.getCollectionPath, indexWorker.getDocument().getCollection.getId, indexWorker.getDocument().getDocId, None, None, None, None, Seq.empty)))
this.processing = processing + (DOCUMENT_NODE_PATH -> processingAtPath)
}
}
@@ -390,7 +413,7 @@ class AlgoliaStreamListener(indexWorker: AlgoliaIndexWorker, broker: DBBroker, i
if (elementRootObjects.nonEmpty) {

// record the new RootObjects that we are processing
val newElementRootObjects: Seq[PartialRootObject] = elementRootObjects.map(rootObjectConfig => PartialRootObject(rootObjectConfig._1, rootObjectConfig._2, IndexableRootObject(indexWorker.getDocument().getCollection.getURI.getCollectionPath, indexWorker.getDocument().getCollection.getId, indexWorker.getDocument().getDocId, None, Some(element.getNodeId.toString), None, Seq.empty)))
val newElementRootObjects: Seq[PartialRootObject] = elementRootObjects.map(rootObjectConfig => PartialRootObject(rootObjectConfig._1, rootObjectConfig._2, IndexableRootObject(indexWorker.getDocument().getCollection.getURI.getCollectionPath, indexWorker.getDocument().getCollection.getId, indexWorker.getDocument().getDocId, None, None, Some(element.getNodeId.toString), None, Seq.empty)))
val processingAtPath = processing.get(pathClone) match {
case Some(existingElementRootObjects) =>
// we filter out newElementRootObjects that are equivalent to elementRootObjects which we are already processing
@@ -421,7 +444,7 @@ class AlgoliaStreamListener(indexWorker: AlgoliaIndexWorker, broker: DBBroker, i
if (elementRootObjects.nonEmpty) {
// index them
elementRootObjects
.foreach(partialRootObject => index(partialRootObject.indexName, partialRootObject.indexable.copy(userSpecifiedDocumentId = getUserSpecifiedDocumentIdOrWarn(partialRootObject.indexName), userSpecifiedNodeId = getUserSpecifiedNodeIdOrWarn(partialRootObject.indexName, pathClone))))
.foreach(partialRootObject => index(partialRootObject.indexName, partialRootObject.indexable.copy(userSpecifiedDocumentId = getUserSpecifiedDocumentIdOrWarn(partialRootObject.indexName), userSpecifiedVisibleBy = getUserSpecifiedVisibleByOrWarn(partialRootObject.indexName), userSpecifiedNodeId = getUserSpecifiedNodeIdOrWarn(partialRootObject.indexName, pathClone))))

// finished... so remove them from the map of things we are processing
this.processing = processing.view.filterKeys(_ != pathClone).toMap
@@ -441,12 +464,13 @@ class AlgoliaStreamListener(indexWorker: AlgoliaIndexWorker, broker: DBBroker, i
}

// finish indexing any documents for which we have IndexableRootObjects
indexConfigs.keys.foreach(indexName => finishDocumentIndex(indexName, userSpecifiedDocumentIds.get(indexName).flatMap(_.value), indexWorker.getDocument.getCollection.getId, indexWorker.getDocument.getDocId))
indexConfigs.keys.foreach(indexName => finishDocumentIndex(indexName, userSpecifiedDocumentIds.get(indexName).flatMap(_.value), userSpecifiedVisibleByIds.get(indexName).flatMap(_.value), indexWorker.getDocument.getCollection.getId, indexWorker.getDocument.getDocId))

// finished... so clear the map of things we are processing
this.processing = Map.empty

this.userSpecifiedDocumentIds = Map.empty
this.userSpecifiedVisibleByIds = Map.empty
this.userSpecifiedNodeIds = Map.empty
}

@@ -465,6 +489,21 @@ class AlgoliaStreamListener(indexWorker: AlgoliaIndexWorker, broker: DBBroker, i
}
}

private def getUserSpecifiedVisibleByOrWarn(indexName: IndexName) : Option[UserSpecifiedVisibleBy] = {
userSpecifiedVisibleByIds.get(indexName) match {
case Some(userSpecifiedVisibleBy) =>
userSpecifiedVisibleBy.value match {
case value : Some[UserSpecifiedVisibleBy] =>
value
case None =>
logger.warn(s"Unable to find user specified document id for index=${indexName} at path=${userSpecifiedVisibleBy.path}, will use default!")
None
}
case None =>
None
}
}

private def getUserSpecifiedNodeIdOrWarn(indexName: IndexName, rootObjectPath: NodePath) : Option[UserSpecifiedNodeId] = {
val maybeKey = userSpecifiedNodeIds
.keySet
@@ -576,6 +615,14 @@ class AlgoliaStreamListener(indexWorker: AlgoliaIndexWorker, broker: DBBroker, i

private def getString(node: ElementOrAttributeImpl): Either[Seq[Throwable], String] = node.fold(serializeAsText, _.getValue.asRight)

private def getStringFromNode(node: Node): Either[Seq[Throwable], String] = {
node match {
case attr: Attr =>
attr.getValue.asRight
case other =>
serializeAsText(other)
}
  }

private def updateProcessingChildren(path: NodePath, node: ElementOrAttributeImpl) {

def nodeIdStr(node: ElementOrAttributeImpl) : String = foldNode(node, _.getNodeId.toString)
@@ -839,7 +886,7 @@ class AlgoliaStreamListener(indexWorker: AlgoliaIndexWorker, broker: DBBroker, i
incrementalIndexingActor ! Add(indexName, indexableRootObject)
}

private def finishDocumentIndex(indexName: IndexName, userSpecifiedDocumentId: Option[String], collectionId: CollectionId, documentId: DocumentId) {
private def finishDocumentIndex(indexName: IndexName, userSpecifiedDocumentId: Option[String], userSpecifiedVisibleBy: Option[String], collectionId: CollectionId, documentId: DocumentId) {
incrementalIndexingActor ! FinishDocument(indexName, userSpecifiedDocumentId, collectionId, documentId)
}
}
@@ -29,6 +29,7 @@ object IndexableRootObjectJsonSerializer {
val OBJECT_ID_FIELD_NAME = "objectID"
val COLLECTION_PATH_FIELD_NAME = "collection"
val DOCUMENT_ID_FIELD_NAME = "documentID"
val RECORD_VISIBLE_BY_FIELD_NAME = "visible_by"
}

@@ -47,6 +48,11 @@ class IndexableRootObjectJsonSerializer extends JsonSerializer[IndexableRootObje
case None =>
gen.writeNumberField(DOCUMENT_ID_FIELD_NAME, value.documentId)
}
value.userSpecifiedVisibleBy match {
case Some(usv) =>
gen.writeStringField(RECORD_VISIBLE_BY_FIELD_NAME, usv)
case None => // do nothing
}

serializeChildren(value.children, gen, serializers)

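For reference, a sketch of the JSON this serializer might now emit for one root object; the field names are the constants defined above, while the values are invented for illustration:

```json
{
  "objectID": "5/1.2.3",
  "collection": "/db/my-collection",
  "documentID": 5,
  "visible_by": "editors"
}
```

When `userSpecifiedVisibleBy` is `None`, the `visible_by` field is simply omitted from the object (and `documentID` is written as a string rather than a number when a user-specified document id is present).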
@@ -59,7 +59,7 @@ class AlgoliaIndexManagerActor extends Actor {
val indexActor = getOrCreatePerIndexActor(indexName)
indexActor ! changes

case rfd @ RemoveForDocument(indexName, documentId, userSpecifiedDocumentId) =>
case rfd @ RemoveForDocument(indexName, documentId, userSpecifiedDocumentId, userSpecifiedVisibleBy) =>
if(logger.isTraceEnabled) {
logger.trace(s"Initiating RemoveForDocument (id=${documentId}, userSpecificDocId=${userSpecifiedDocumentId}) for index: $indexName")
}
@@ -199,7 +199,7 @@ class AlgoliaIndexActor(indexName: IndexName, algoliaIndex: Index[IndexableRootO



case RemoveForDocument(_, documentId, userSpecifiedDocumentId) =>
case RemoveForDocument(_, documentId, userSpecifiedDocumentId, userSpecifiedVisibleBy) =>
val batchLogMsgGroupId: BatchLogMsgGroupId = System.nanoTime()

logger.info(s"Sending remove document (msgId=$batchLogMsgGroupId) to Algolia for documentId=$documentId, userSpecificDocId=$userSpecifiedDocumentId in index: $indexName")
@@ -33,7 +33,7 @@ object IncrementalIndexingManagerActor {
case class Add(indexName: IndexName, indexableRootObject: IndexableRootObject)
case class FinishDocument(indexName: IndexName, userSpecifiedDocumentId: Option[String], collectionId: CollectionId, documentId: DocumentId)
case class IndexChanges(indexName: IndexName, changes: Changes)
case class RemoveForDocument(indexName: IndexName, documentId: DocumentId, userSpecifiedDocumentId: Option[String])
case class RemoveForDocument(indexName: IndexName, documentId: DocumentId, userSpecifiedDocumentId: Option[String], userSpecifiedVisibleBy: Option[String])
case class RemoveForCollection(indexName: IndexName, collectionPath: String)
case object DropIndexes
}
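A minimal sketch of constructing the widened message, as the stream listener now does when a document is removed; it assumes the `IndexName` and `DocumentId` aliases resolve to `String` and `Int`, and all values are invented:

```scala
// The new fourth field carries the document's visibleBy value, if any;
// None means the index was configured without a visibleBy attribute.
val remove = RemoveForDocument(
  indexName = "my-algolia-index-1",
  documentId = 5,
  userSpecifiedDocumentId = Some("my-doc-1"),
  userSpecifiedVisibleBy = Some("editors")
)
incrementalIndexingActor ! remove // ActorRef for the IncrementalIndexingManagerActor
```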
@@ -84,7 +84,7 @@ class IndexLocalStoreManagerActor(dataDir: Path) extends Actor {
case indexChanges : IndexChanges =>
context.parent ! indexChanges

case removeForDocument @ RemoveForDocument(indexName, _, _) =>
case removeForDocument @ RemoveForDocument(indexName, _, _, _) =>
val indexActor = getOrCreatePerIndexActor(indexName)
indexActor ! removeForDocument

@@ -131,7 +131,7 @@ class IndexLocalStoreActor(indexesDir: Path, indexName: String) extends Actor {
this.processing = processing + (documentId -> timestamp)
getOrCreatePerDocumentActor(documentId)

case Add(_, iro @ IndexableRootObject(_, _, documentId, _, _, _, _)) =>
case Add(_, iro @ IndexableRootObject(_, _, documentId, _, _, _, _, _)) =>
val perDocumentActor = getOrCreatePerDocumentActor(documentId)
val timestamp = processing(documentId)
perDocumentActor ! Write(timestamp, iro)
@@ -152,7 +152,7 @@ class IndexLocalStoreActor(indexesDir: Path, indexName: String) extends Actor {
context.parent ! IndexChanges(indexName, changes)
//TODO(AR) when to delete previous timestamp (after upload into Algolia)

case RemoveForDocument(_, documentId, userSpecifiedDocumentId) =>
case RemoveForDocument(_, documentId, userSpecifiedDocumentId, userSpecifiedVisibleBy) =>
val perDocumentActor = getOrCreatePerDocumentActor(documentId)
val maybeTimestamp = processing.get(documentId)
perDocumentActor ! RemoveDocument(documentId, userSpecifiedDocumentId, maybeTimestamp) // perDocumentActor will stop itself!