Merge pull request #10 from evolvedbinary/feature/visible-by
Feature: introduce visibleBy attribute
adamretter authored Dec 21, 2023
2 parents cce6950 + 4993e19 commit 193a0b3
Showing 12 changed files with 386 additions and 138 deletions.
12 changes: 11 additions & 1 deletion README.md
@@ -71,7 +71,7 @@ For incremental indexing to work, you need to have two sets of unique ids, one f
<namespace>http://www.w3.org/XML/1998/namespace</namespace>
</namespaceMapping>
</namespaceMappings>
<index name="my-algolia-index-1" documentId="/path/to/unique-id/@xml:id">
<index name="my-algolia-index-1" documentId="/path/to/unique-id/@xml:id" visibleBy="/path/to/unique-id">
<rootObject path="/path/to/element" nodeId="@xml:id">
<attribute name="f1" path="/further/patha"/>
<attribute name="f2" path="/further/pathb" type="integer"/>
@@ -87,6 +87,8 @@ For incremental indexing to work, you need to have two sets of unique ids, one f
```


An optional `visibleBy` attribute can be used to restrict data access when searching the Algolia index.

A `rootObject` is equivalent to an object inside an Algolia Index. We create one "rootObject" either for each document, or document fragment (if you specify a path attribute on the rootObject).

An `attribute` (represents a JSON object attribute, not to be confused with an XML attribute) is a simple key/value pair that is extracted from the XML and placed into the Algolia object ("rootObject" as we call it). All of the text nodes or attribute values indicated by the "path" on the "attribute" element will be serialized to a string (and then converted if you set an explicit "type" attribute).
@@ -99,6 +101,14 @@ An `object` represents a JSON object, and this is where things become fun, we ba

The `name` attribute that is available on the "attribute" and "object" elements allows you to set the name of the field in the JSON object of the Algolia index; this means that the names of your data fields can be different in Algolia and eXist if you wish.

## Limiting object access to certain users
You can limit data access by setting the `visibleBy` attribute in `collection.xconf` and then matching that path in your XML data, preferably in the header.
You can use this example from our test suite:

XML: https://github.com/BCDH/exist-algolia-index/tree/master/src/test/resources/integration/user-specified-visibleBy/VSK.TEST.xml

collection.xconf: https://github.com/BCDH/exist-algolia-index/tree/master/src/test/resources/integration/user-specified-visibleBy/collection.xconf
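
For orientation, here is a minimal sketch of how the two sides fit together; the paths and values below are invented for illustration and are not taken from the test files above:

```xml
<!-- collection.xconf (sketch): visibleBy points at a node in each document -->
<index name="my-algolia-index-1"
       documentId="/article/header/id/@xml:id"
       visibleBy="/article/header/visibility">
    <rootObject path="/article/body/entry" nodeId="@xml:id">
        <attribute name="headword" path="form"/>
    </rootObject>
</index>
```

```xml
<!-- matching document (sketch): the string found at the visibleBy path is
     copied into every object indexed from this document -->
<article>
    <header>
        <id xml:id="doc-1"/>
        <visibility>editors</visibility>
    </header>
    <body>
        <entry xml:id="e-1"><form>test</form></entry>
    </body>
</article>
```

At query time the stored value can then be used with Algolia's standard `filters` parameter (for example `filters=visible_by:editors`, typically enforced through a secured API key) so that each user only receives the records they are allowed to see.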

<a name="logging"/>

## Enable logging in eXist (optional)
5 changes: 5 additions & 0 deletions src/main/resources/xsd/exist-algolia-index-config.xsd
@@ -123,6 +123,11 @@
<xs:documentation>Indicates an element or attribute to use the value of as a unique id for the document; if omitted, the document's id is used</xs:documentation>
</xs:annotation>
</xs:attribute>
<xs:attribute name="visibleBy" use="optional" type="c:absoluteElementPathOrAbsoluteAttributePathType">
<xs:annotation>
<xs:documentation>Sets the rule for who can request the records; if omitted, the default value is public</xs:documentation>
</xs:annotation>
</xs:attribute>
</xs:complexType>
</xs:element>

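Since the attribute is declared with `c:absoluteElementPathOrAbsoluteAttributePathType`, a configuration may point `visibleBy` at either an element or an attribute. A hypothetical sketch of each form (the other required content of the `index` element is omitted for brevity):

```xml
<!-- element path: the text content of <visibility> supplies the visible-by value -->
<index name="idx-a" documentId="/doc/@xml:id" visibleBy="/doc/header/visibility">...</index>

<!-- attribute path: the attribute's value supplies it -->
<index name="idx-b" documentId="/doc/@xml:id" visibleBy="/doc/header/@visibility">...</index>
```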
@@ -17,7 +17,6 @@

package org.humanistika.exist.index.algolia

import java.io.StringWriter
import java.util.{ArrayDeque, Deque, HashMap => JHashMap, Map => JMap, Properties => JProperties}
import javax.xml.namespace.QName

@@ -31,15 +30,13 @@ import org.exist_db.collection_config._1.{Algolia, LiteralType, Properties, Root
import org.exist_db.collection_config._1.LiteralType._
import Serializer._
import akka.actor.ActorRef
import com.fasterxml.jackson.core.{JsonFactory, JsonGenerator}
import grizzled.slf4j.Logger
import org.exist.indexing.StreamListener.ReindexMode
import org.exist.numbering.DLN
import org.humanistika.exist.index.algolia.NodePathWithPredicates.{AtomicEqualsComparison, AtomicNotEqualsComparison, ComponentType, SequenceEqualsComparison}
import org.humanistika.exist.index.algolia.backend.IncrementalIndexingManagerActor.{Add, FinishDocument, RemoveForDocument, StartDocument}
import org.w3c.dom._
import JsonUtil.writeValueField
import org.exist.util.serializer.SAXSerializer


import cats.syntax.either._

@@ -158,7 +155,7 @@ object AlgoliaStreamListener {
.getOrElse(new NodePath())
}

case class UserSpecifiedDocumentPathId(path: NodePath, value: Option[UserSpecifiedDocumentId])
case class UserSpecifiedOption(path: NodePath, value: Option[String])

case class PartialRootObject(indexName: IndexName, config: RootObject, indexable: IndexableRootObject) {
def identityEquals(other: PartialRootObject) : Boolean = {
@@ -202,7 +199,8 @@ class AlgoliaStreamListener(indexWorker: AlgoliaIndexWorker, broker: DBBroker, i

private var replacingDocument: Boolean = false
private var processing: Map[NodePath, Seq[PartialRootObject]] = Map.empty
private var userSpecifiedDocumentIds: Map[IndexName, UserSpecifiedDocumentPathId] = Map.empty
private var userSpecifiedDocumentIds: Map[IndexName, UserSpecifiedOption] = Map.empty
private var userSpecifiedVisibleByIds: Map[IndexName, UserSpecifiedOption] = Map.empty
private var userSpecifiedNodeIds: Map[(IndexName, NodePath), Option[UserSpecifiedNodeId]] = Map.empty

case class ContextElement(name: QName, attributes: Map[QName, String])
@@ -224,8 +222,13 @@
override def startIndexDocument(transaction: Txn) {
// find any User Specified Document IDs that we need to complete
this.userSpecifiedDocumentIds = indexConfigs
.map{ case (indexName, index) => indexName -> Option(index.getDocumentId).map(path => UserSpecifiedDocumentPathId(nodePath(ns, path), None)) }
.collect{ case (indexName, Some(usdid)) => indexName -> usdid }
      .map{ case (indexName, index) => indexName -> Option(index.getDocumentId).map(path => UserSpecifiedOption(nodePath(ns, path), None)) }
      .collect{ case (indexName, Some(usdid)) => indexName -> usdid }

    // find any user-specified visibleBy values that we need to complete
    this.userSpecifiedVisibleByIds = indexConfigs
      .map{ case (indexName, index) => indexName -> Option(index.getVisibleBy).map(path => UserSpecifiedOption(nodePath(ns, path), None)) }
      .collect{ case (indexName, Some(usvb)) => indexName -> usvb }

getWorker.getMode() match {
case ReindexMode.STORE =>
@@ -243,9 +246,6 @@
// update the current context
context.push(ContextElement(element.getQName.toJavaQName, Map.empty))

// update any userSpecifiedDocumentIds which we haven't yet completed and that match this element path
updateUserSpecifiedDocumentIds(pathClone, element.asLeft)

getWorker.getMode() match {
case ReindexMode.STORE =>
startElementForStore(transaction, element, pathClone)
@@ -282,6 +282,13 @@
super.attribute(transaction, attrib, pathClone)
}


  override def characters(transaction: Txn, text: AbstractCharacterData, path: NodePath): Unit = {
    val pathClone = path.duplicate
    // update any userSpecifiedVisibleByIds which we haven't yet completed and that match this text node's path
    updateUserSpecifiedVisibleIds(pathClone, text)
  }

override def endElement(transaction: Txn, element: ElementImpl, path: NodePath) {
getWorker.getMode() match {
case ReindexMode.STORE =>
@@ -313,6 +320,7 @@

// clear any User Specified Document IDs
this.userSpecifiedDocumentIds = Map.empty
this.userSpecifiedVisibleByIds = Map.empty

this.context.clear()

@@ -344,6 +352,21 @@
}
}

  private def updateUserSpecifiedVisibleIds(path: NodePath, node: Node): Unit = {
    for ((indexName, usvb) <- userSpecifiedVisibleByIds if usvb.value.isEmpty && usvb.path.equals(path)) { //TODO(AR) do we need to compare the index name?
      getStringFromNode(node) match {
        case Right(idValue) if (!idValue.isEmpty) =>
          this.userSpecifiedVisibleByIds = userSpecifiedVisibleByIds + (indexName -> usvb.copy(value = Some(idValue)))

        case Right(idValue) if (idValue.isEmpty) =>
          logger.error(s"UserSpecifiedVisibleByIds: Unable to use empty string for node path=${path}")

        case Left(ts) =>
          logger.error(s"UserSpecifiedVisibleByIds: Unable to serialize node for path=${path}")
      }
    }
  }

private def updateUserSpecifiedNodeIds(path: NodePath, attrib: AttrImpl): Unit = {
for (((indexName, nodeIdPath), usnid) <- userSpecifiedNodeIds if usnid.isEmpty && nodeIdPath.equals(path)) { //TODO(AR) do we need to compare the index name?
getString(attrib.asRight) match {
@@ -365,7 +388,7 @@ class AlgoliaStreamListener(indexWorker: AlgoliaIndexWorker, broker: DBBroker, i
private def removeForDocument() = {
val docId = getWorker.getDocument.getDocId
for(indexName <- indexConfigs.keys) {
incrementalIndexingActor ! RemoveForDocument(indexName, docId, userSpecifiedDocumentIds.get(indexName).flatMap(_.value))
incrementalIndexingActor ! RemoveForDocument(indexName, docId, userSpecifiedDocumentIds.get(indexName).flatMap(_.value), userSpecifiedVisibleByIds.get(indexName).flatMap(_.value))
}
}

@@ -379,7 +402,7 @@ class AlgoliaStreamListener(indexWorker: AlgoliaIndexWorker, broker: DBBroker, i
if (documentRootObjects.nonEmpty) {
// as we are just starting a document,
// we aren't processing these yet, so let's record them
val processingAtPath = documentRootObjects.map(rootObjectConfig => PartialRootObject(rootObjectConfig._1, rootObjectConfig._2, IndexableRootObject(indexWorker.getDocument.getCollection.getURI.getCollectionPath, indexWorker.getDocument().getCollection.getId, indexWorker.getDocument().getDocId, None, None, None, Seq.empty)))
val processingAtPath = documentRootObjects.map(rootObjectConfig => PartialRootObject(rootObjectConfig._1, rootObjectConfig._2, IndexableRootObject(indexWorker.getDocument.getCollection.getURI.getCollectionPath, indexWorker.getDocument().getCollection.getId, indexWorker.getDocument().getDocId, None, None, None, None, Seq.empty)))
this.processing = processing + (DOCUMENT_NODE_PATH -> processingAtPath)
}
}
@@ -390,7 +413,7 @@ class AlgoliaStreamListener(indexWorker: AlgoliaIndexWorker, broker: DBBroker, i
if (elementRootObjects.nonEmpty) {

// record the new RootObjects that we are processing
val newElementRootObjects: Seq[PartialRootObject] = elementRootObjects.map(rootObjectConfig => PartialRootObject(rootObjectConfig._1, rootObjectConfig._2, IndexableRootObject(indexWorker.getDocument().getCollection.getURI.getCollectionPath, indexWorker.getDocument().getCollection.getId, indexWorker.getDocument().getDocId, None, Some(element.getNodeId.toString), None, Seq.empty)))
val newElementRootObjects: Seq[PartialRootObject] = elementRootObjects.map(rootObjectConfig => PartialRootObject(rootObjectConfig._1, rootObjectConfig._2, IndexableRootObject(indexWorker.getDocument().getCollection.getURI.getCollectionPath, indexWorker.getDocument().getCollection.getId, indexWorker.getDocument().getDocId, None, None, Some(element.getNodeId.toString), None, Seq.empty)))
val processingAtPath = processing.get(pathClone) match {
case Some(existingElementRootObjects) =>
// we filter out newElementRootObjects that are equivalent to elementRootObjects which we are already processing
@@ -421,7 +444,7 @@ class AlgoliaStreamListener(indexWorker: AlgoliaIndexWorker, broker: DBBroker, i
if (elementRootObjects.nonEmpty) {
// index them
elementRootObjects
.foreach(partialRootObject => index(partialRootObject.indexName, partialRootObject.indexable.copy(userSpecifiedDocumentId = getUserSpecifiedDocumentIdOrWarn(partialRootObject.indexName), userSpecifiedNodeId = getUserSpecifiedNodeIdOrWarn(partialRootObject.indexName, pathClone))))
.foreach(partialRootObject => index(partialRootObject.indexName, partialRootObject.indexable.copy(userSpecifiedDocumentId = getUserSpecifiedDocumentIdOrWarn(partialRootObject.indexName), userSpecifiedVisibleBy = getUserSpecifiedVisibleByOrWarn(partialRootObject.indexName), userSpecifiedNodeId = getUserSpecifiedNodeIdOrWarn(partialRootObject.indexName, pathClone))))

// finished... so remove them from the map of things we are processing
this.processing = processing.view.filterKeys(_ != pathClone).toMap
@@ -441,12 +464,13 @@ class AlgoliaStreamListener(indexWorker: AlgoliaIndexWorker, broker: DBBroker, i
}

// finish indexing any documents for which we have IndexableRootObjects
indexConfigs.keys.foreach(indexName => finishDocumentIndex(indexName, userSpecifiedDocumentIds.get(indexName).flatMap(_.value), indexWorker.getDocument.getCollection.getId, indexWorker.getDocument.getDocId))
indexConfigs.keys.foreach(indexName => finishDocumentIndex(indexName, userSpecifiedDocumentIds.get(indexName).flatMap(_.value), userSpecifiedVisibleByIds.get(indexName).flatMap(_.value), indexWorker.getDocument.getCollection.getId, indexWorker.getDocument.getDocId))

// finished... so clear the map of things we are processing
this.processing = Map.empty

this.userSpecifiedDocumentIds = Map.empty
this.userSpecifiedVisibleByIds = Map.empty
this.userSpecifiedNodeIds = Map.empty
}

@@ -465,6 +489,21 @@ class AlgoliaStreamListener(indexWorker: AlgoliaIndexWorker, broker: DBBroker, i
}
}

private def getUserSpecifiedVisibleByOrWarn(indexName: IndexName) : Option[UserSpecifiedVisibleBy] = {
userSpecifiedVisibleByIds.get(indexName) match {
case Some(userSpecifiedVisibleBy) =>
userSpecifiedVisibleBy.value match {
case value : Some[UserSpecifiedVisibleBy] =>
value
case None =>
logger.warn(s"Unable to find user specified document id for index=${indexName} at path=${userSpecifiedVisibleBy.path}, will use default!")
None
}
case None =>
None
}
}

private def getUserSpecifiedNodeIdOrWarn(indexName: IndexName, rootObjectPath: NodePath) : Option[UserSpecifiedNodeId] = {
val maybeKey = userSpecifiedNodeIds
.keySet
@@ -576,6 +615,14 @@ class AlgoliaStreamListener(indexWorker: AlgoliaIndexWorker, broker: DBBroker, i

private def getString(node: ElementOrAttributeImpl): Either[Seq[Throwable], String] = node.fold(serializeAsText, _.getValue.asRight)

private def getStringFromNode(node: Node): Either[Seq[Throwable], String] = {
node match {
case attr: Attr =>
attr.getValue.asRight
case other =>
serializeAsText(other)
}
  }

private def updateProcessingChildren(path: NodePath, node: ElementOrAttributeImpl) {

def nodeIdStr(node: ElementOrAttributeImpl) : String = foldNode(node, _.getNodeId.toString)
@@ -839,7 +886,7 @@ class AlgoliaStreamListener(indexWorker: AlgoliaIndexWorker, broker: DBBroker, i
incrementalIndexingActor ! Add(indexName, indexableRootObject)
}

private def finishDocumentIndex(indexName: IndexName, userSpecifiedDocumentId: Option[String], collectionId: CollectionId, documentId: DocumentId) {
private def finishDocumentIndex(indexName: IndexName, userSpecifiedDocumentId: Option[String], userSpecifiedVisibleBy: Option[String], collectionId: CollectionId, documentId: DocumentId) {
incrementalIndexingActor ! FinishDocument(indexName, userSpecifiedDocumentId, collectionId, documentId)
}
}
@@ -29,6 +29,7 @@ object IndexableRootObjectJsonSerializer {
val OBJECT_ID_FIELD_NAME = "objectID"
val COLLECTION_PATH_FIELD_NAME = "collection"
val DOCUMENT_ID_FIELD_NAME = "documentID"
val RECORD_VISIBLE_BY_FIELD_NAME = "visible_by"
}

@@ -47,6 +48,11 @@ class IndexableRootObjectJsonSerializer extends JsonSerializer[IndexableRootObje
case None =>
gen.writeNumberField(DOCUMENT_ID_FIELD_NAME, value.documentId)
}
value.userSpecifiedVisibleBy match {
case Some(usv) =>
gen.writeStringField(RECORD_VISIBLE_BY_FIELD_NAME, usv)
case None => // do nothing
}

serializeChildren(value.children, gen, serializers)

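For reference, a sketch of the JSON this serializer might now emit for one root object; the field names are the constants defined above, while the values are invented for illustration:

```json
{
  "objectID": "5/1.2.3",
  "collection": "/db/my-collection",
  "documentID": 5,
  "visible_by": "editors"
}
```

When `userSpecifiedVisibleBy` is `None`, the `visible_by` field is simply omitted from the object (and `documentID` is written as a string rather than a number when a user-specified document id is present).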
@@ -59,7 +59,7 @@ class AlgoliaIndexManagerActor extends Actor {
val indexActor = getOrCreatePerIndexActor(indexName)
indexActor ! changes

case rfd @ RemoveForDocument(indexName, documentId, userSpecifiedDocumentId) =>
case rfd @ RemoveForDocument(indexName, documentId, userSpecifiedDocumentId, userSpecifiedVisibleBy) =>
if(logger.isTraceEnabled) {
logger.trace(s"Initiating RemoveForDocument (id=${documentId}, userSpecificDocId=${userSpecifiedDocumentId}) for index: $indexName")
}
@@ -199,7 +199,7 @@ class AlgoliaIndexActor(indexName: IndexName, algoliaIndex: Index[IndexableRootO



case RemoveForDocument(_, documentId, userSpecifiedDocumentId) =>
case RemoveForDocument(_, documentId, userSpecifiedDocumentId, userSpecifiedVisibleBy) =>
val batchLogMsgGroupId: BatchLogMsgGroupId = System.nanoTime()

logger.info(s"Sending remove document (msgId=$batchLogMsgGroupId) to Algolia for documentId=$documentId, userSpecificDocId=$userSpecifiedDocumentId in index: $indexName")
@@ -33,7 +33,7 @@ object IncrementalIndexingManagerActor {
case class Add(indexName: IndexName, indexableRootObject: IndexableRootObject)
case class FinishDocument(indexName: IndexName, userSpecifiedDocumentId: Option[String], collectionId: CollectionId, documentId: DocumentId)
case class IndexChanges(indexName: IndexName, changes: Changes)
case class RemoveForDocument(indexName: IndexName, documentId: DocumentId, userSpecifiedDocumentId: Option[String])
case class RemoveForDocument(indexName: IndexName, documentId: DocumentId, userSpecifiedDocumentId: Option[String], userSpecifiedVisibleBy: Option[String])
case class RemoveForCollection(indexName: IndexName, collectionPath: String)
case object DropIndexes
}
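A minimal sketch of constructing the widened message, as the stream listener now does when a document is removed; it assumes the `IndexName` and `DocumentId` aliases resolve to `String` and `Int`, and all values are invented:

```scala
// The new fourth field carries the document's visibleBy value, if any;
// None means the index was configured without a visibleBy attribute.
val remove = RemoveForDocument(
  indexName = "my-algolia-index-1",
  documentId = 5,
  userSpecifiedDocumentId = Some("my-doc-1"),
  userSpecifiedVisibleBy = Some("editors")
)
incrementalIndexingActor ! remove // ActorRef for the IncrementalIndexingManagerActor
```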
@@ -84,7 +84,7 @@ class IndexLocalStoreManagerActor(dataDir: Path) extends Actor {
case indexChanges : IndexChanges =>
context.parent ! indexChanges

case removeForDocument @ RemoveForDocument(indexName, _, _) =>
case removeForDocument @ RemoveForDocument(indexName, _, _, _) =>
val indexActor = getOrCreatePerIndexActor(indexName)
indexActor ! removeForDocument

@@ -131,7 +131,7 @@ class IndexLocalStoreActor(indexesDir: Path, indexName: String) extends Actor {
this.processing = processing + (documentId -> timestamp)
getOrCreatePerDocumentActor(documentId)

case Add(_, iro @ IndexableRootObject(_, _, documentId, _, _, _, _)) =>
case Add(_, iro @ IndexableRootObject(_, _, documentId, _, _, _, _, _)) =>
val perDocumentActor = getOrCreatePerDocumentActor(documentId)
val timestamp = processing(documentId)
perDocumentActor ! Write(timestamp, iro)
@@ -152,7 +152,7 @@ class IndexLocalStoreActor(indexesDir: Path, indexName: String) extends Actor {
context.parent ! IndexChanges(indexName, changes)
//TODO(AR) when to delete previous timestamp (after upload into Algolia)

case RemoveForDocument(_, documentId, userSpecifiedDocumentId) =>
case RemoveForDocument(_, documentId, userSpecifiedDocumentId, userSpecifiedVisibleBy) =>
val perDocumentActor = getOrCreatePerDocumentActor(documentId)
val maybeTimestamp = processing.get(documentId)
perDocumentActor ! RemoveDocument(documentId, userSpecifiedDocumentId, maybeTimestamp) // perDocumentActor will stop itself!