Skip to content

Commit

Permalink
feat: Adding capability use Cognitive Service Language Service asynch…
Browse files Browse the repository at this point in the history
…ronously for Summarization (#2342)

* Adding capability use Cognitive Service Language Service asynchronously. The transformer calls the async service and poll for result. The polling delay and max retry attempts is controlled by parameters. Request creation for each task is extracted into separate trait to make code more readable and manageable. There has been minimal changes in AnalyzeText class.

* Adding unit test and fixing failing style test

* Adding unit test and fixing style for the test.

* Adding support for Custom MultiLabel Classification and Single Label classification. Unit tests are added to validate that requests and response are correct. Also added tiemout for AbstractiveSummary requests.

* fixing minor problems and documentation

* Fixing parameter name to reflect the name of field

* Fixing failing fuzzing tests.

* making traits and methods package private

---------

Co-authored-by: Farrukh Masud <farrukhmasud@microsoft.com>
  • Loading branch information
FarrukhMasud and FMasudMsft authored Feb 6, 2025
1 parent bab6aed commit 1b5df70
Show file tree
Hide file tree
Showing 6 changed files with 2,234 additions and 42 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3,33 +3,32 @@

package com.microsoft.azure.synapse.ml.services.language

import com.microsoft.azure.synapse.ml.services._
import com.microsoft.azure.synapse.ml.services.text.{TADocument, TextAnalyticsAutoBatch}
import com.microsoft.azure.synapse.ml.logging.{FeatureNames, SynapseMLLogging}
import com.microsoft.azure.synapse.ml.logging.{ FeatureNames, SynapseMLLogging }
import com.microsoft.azure.synapse.ml.param.ServiceParam
import com.microsoft.azure.synapse.ml.stages.{FixedMiniBatchTransformer, FlattenBatch, HasBatchSize, UDFTransformer}
import org.apache.http.entity.{AbstractHttpEntity, StringEntity}
import com.microsoft.azure.synapse.ml.services._
import com.microsoft.azure.synapse.ml.services.text.{ TADocument, TextAnalyticsAutoBatch }
import com.microsoft.azure.synapse.ml.stages.{ FixedMiniBatchTransformer, FlattenBatch, HasBatchSize, UDFTransformer }
import org.apache.http.entity.{ AbstractHttpEntity, StringEntity }
import org.apache.spark.injections.UDFUtils
import org.apache.spark.ml.param.{Param, ParamValidators}
import org.apache.spark.ml.{ ComplexParamsReadable, NamespaceInjections, PipelineModel }
import org.apache.spark.ml.param.{ Param, ParamValidators }
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.ml.{ComplexParamsReadable, NamespaceInjections, PipelineModel}
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.types.{ArrayType, DataType, StructType}
import spray.json.DefaultJsonProtocol._
import org.apache.spark.sql.types.{ ArrayType, DataType, StructType }
import spray.json._
import spray.json.DefaultJsonProtocol._

trait AnalyzeTextTaskParameters extends HasServiceParams {
trait HasAnalyzeTextServiceBaseParams extends HasServiceParams {
val modelVersion = new ServiceParam[String](
this, name = "modelVersion", "Version of the model")

def getModelVersion: String = getScalarParam(modelVersion)
def setModelVersion(v: String): this.type = setScalarParam(modelVersion, v)

def getModelVersionCol: String = getVectorParam(modelVersion)
def setModelVersionCol(v: String): this.type = setVectorParam(modelVersion, v)

def getModelVersion: String = getScalarParam(modelVersion)

def getModelVersionCol: String = getVectorParam(modelVersion)

val loggingOptOut = new ServiceParam[Boolean](
this, "loggingOptOut", "loggingOptOut for task"
Expand All @@ -44,13 +43,15 @@ trait AnalyzeTextTaskParameters extends HasServiceParams {
def getLoggingOptOutCol: String = getVectorParam(loggingOptOut)

val stringIndexType = new ServiceParam[String](this, "stringIndexType",
"Specifies the method used to interpret string offsets. " +
"Defaults to Text Elements (Graphemes) according to Unicode v8.0.0. " +
"For additional information see https://aka.ms/text-analytics-offsets",
isValid = {
case Left(s) => Set("TextElements_v8", "UnicodeCodePoint", "Utf16CodeUnit")(s)
case _ => true
})
"Specifies the method used to interpret string offsets. " +
"Defaults to Text Elements(Graphemes) according to Unicode v8.0.0." +
"For more information see https://aka.ms/text-analytics-offsets",
isValid = {
case Left(s) => Set("TextElements_v8",
"UnicodeCodePoint",
"Utf16CodeUnit")(s)
case _ => true
})

def setStringIndexType(v: String): this.type = setScalarParam(stringIndexType, v)

Expand All @@ -60,6 +61,36 @@ trait AnalyzeTextTaskParameters extends HasServiceParams {

def getStringIndexTypeCol: String = getVectorParam(stringIndexType)

val showStats = new ServiceParam[Boolean](
this, name = "showStats", "Whether to include detailed statistics in the response",
isURLParam = true)

def setShowStats(v: Boolean): this.type = setScalarParam(showStats, v)

def getShowStats: Boolean = getScalarParam(showStats)

// We don't support setKindCol here because output schemas for different kind are different
val kind = new Param[String](
this, "kind", "Enumeration of supported Text Analysis tasks",
isValid = validKinds.contains(_)
)

protected def validKinds: Set[String]

def setKind(v: String): this.type = set(kind, v)

def getKind: String = $(kind)

setDefault(
showStats -> Left(false),
modelVersion -> Left("latest"),
loggingOptOut -> Left(false),
stringIndexType -> Left("TextElements_v8")
)
}


trait AnalyzeTextTaskParameters extends HasAnalyzeTextServiceBaseParams {
val opinionMining = new ServiceParam[Boolean](
this, name = "opinionMining", "opinionMining option for SentimentAnalysisTask")

Expand Down Expand Up @@ -98,9 +129,6 @@ trait AnalyzeTextTaskParameters extends HasServiceParams {
def getPiiCategoriesCol: String = getVectorParam(piiCategories)

setDefault(
modelVersion -> Left("latest"),
loggingOptOut -> Left(false),
stringIndexType -> Left("TextElements_v8"),
opinionMining -> Left(false),
domain -> Left("none")
)
Expand Down Expand Up @@ -131,33 +159,21 @@ class AnalyzeText(override val uid: String) extends CognitiveServicesBase(uid)

def this() = this(Identifiable.randomUID("AnalyzeText"))

val showStats = new ServiceParam[Boolean](
this, name = "showStats", "Whether to include detailed statistics in the response",
isURLParam = true)

def setShowStats(v: Boolean): this.type = setScalarParam(showStats, v)

def getShowStats: Boolean = getScalarParam(showStats)
override protected def validKinds: Set[String] = Set("EntityLinking",
"EntityRecognition",
"KeyPhraseExtraction",
"LanguageDetection",
"PiiEntityRecognition",
"SentimentAnalysis")

setDefault(
apiVersion -> Left("2022-05-01"),
showStats -> Left(false)
apiVersion -> Left("2022-05-01")
)

override def urlPath: String = "/language/:analyze-text"

override private[ml] def internalServiceType: String = "textanalytics"

// We don't support setKindCol here because output schemas for different kind are different
val kind = new Param[String](
this, "kind", "Enumeration of supported Text Analysis tasks",
isValid = ParamValidators.inArray(Array("EntityLinking", "EntityRecognition", "KeyPhraseExtraction",
"LanguageDetection", "PiiEntityRecognition", "SentimentAnalysis"))
)

def setKind(v: String): this.type = set(kind, v)

def getKind: String = $(kind)

override protected def shouldSkip(row: Row): Boolean = if (emptyParamData(row, text)) {
true
Expand Down
Loading

0 comments on commit 1b5df70

Please # to comment.