@@ -14,15 +14,11 @@ package com.snowplowanalytics.s3.loader.processing
14
14
15
15
import java .time .Instant
16
16
import java .nio .charset .StandardCharsets .UTF_8
17
-
18
17
import cats .syntax .either ._
19
-
20
18
import io .circe .parser .parse
21
-
22
19
import com .snowplowanalytics .iglu .core .SchemaKey
23
20
import com .snowplowanalytics .iglu .core .circe .implicits ._
24
-
25
- import com .snowplowanalytics .s3 .loader .Result
21
+ import com .snowplowanalytics .s3 .loader .{ParsedResult , Result }
26
22
import com .snowplowanalytics .s3 .loader .Config .Purpose
27
23
import com .snowplowanalytics .s3 .loader .monitoring .StatsD .CollectorTstampIdx
28
24
@@ -39,15 +35,26 @@ object Common {
39
35
* @param records raw records themselves
40
36
*/
41
37
def partition(
  purpose: Purpose,
  partitionTsvByApp: Boolean,
  statsDEnabled: Boolean,
  records: List[Result]
): Batch.Partitioned =
  purpose match {
    // Self-describing JSONs are grouped by the row type computed in partitionByType
    case Purpose.SelfDescribingJson =>
      Batch.from(records).map(group => partitionByType(group).toList)

    case Purpose.Enriched =>
      // Lines are split into columns only when StatsD or per-app partitioning is enabled
      val needsColumns = statsDEnabled || partitionTsvByApp
      val parsedRecords = records.map(record => toParsedRecord(record, actuallyParse = needsColumns))
      val parsedBatch =
        if (statsDEnabled) Batch.fromEnriched(parsedRecords)
        else Batch.from(parsedRecords)

      // Whatever the grouping, the parsed columns are dropped again before returning
      if (partitionTsvByApp)
        parsedBatch.map { group =>
          partitionByApp(group).toList.map {
            case (rowType, members) => (rowType, members.map(fromParsedRecord))
          }
        }
      else
        parsedBatch.map(group => List((RowType.Unpartitioned, group.map(fromParsedRecord))))

    // Any other purpose: a single unpartitioned group, records untouched
    case _ =>
      Batch.from(records).map(group => List((RowType.Unpartitioned, group)))
  }
@@ -70,9 +77,25 @@ object Common {
70
77
case Left (_) => RowType .ReadingError
71
78
}
72
79
80
/**
 * Pair a raw record with its tab-separated columns.
 *
 * A failed (`Left`) record is passed through unchanged, since only the
 * successful side is mapped.
 *
 * @param record raw record to wrap
 * @param actuallyParse whether the payload should be decoded as UTF-8 and split on tabs;
 *                      when `false` no decoding happens and the columns part is `None`
 * @return the original bytes together with the optional columns
 */
def toParsedRecord(record: Result, actuallyParse: Boolean): ParsedResult =
  record.map { byteArray =>
    // The delimiter is a bare tab; the -1 limit keeps trailing empty columns
    val columns =
      if (actuallyParse) Some(new String(byteArray, UTF_8).split("\t", -1))
      else None
    (byteArray, columns)
  }
85
+
86
/** Drop the parsed columns of a record, keeping only its raw payload */
def fromParsedRecord(record: ParsedResult): Result =
  record.map { case (rawBytes, _) => rawBytes }
87
+
88
/**
 * Group records by the application id taken from their first TSV column.
 *
 * - a record whose columns are available and contain at least one tab
 *   goes to `RowType.TsvPerApp` keyed by the first column
 * - a record without parsed columns, or whose line has no tab at all,
 *   goes to `RowType.Unpartitioned`
 * - a failed (`Left`) record goes to `RowType.ReadingError`
 *
 * @param records records with their optional pre-split columns
 * @return records grouped by the row type they belong to
 */
def partitionByApp(records: List[ParsedResult]): Map[RowType, List[ParsedResult]] =
  records.groupBy {
    case Right((_, columns)) =>
      // A line without tabs splits into one element holding the whole line, which must
      // not be used as an app id. The length check has to be made on the split columns:
      // the size of the enclosing Option is never greater than 1.
      val appId = columns.flatMap(fields => fields.headOption.filter(_ => fields.length > 1))
      appId.fold[RowType](RowType.Unpartitioned)(RowType.TsvPerApp)
    case Left(_) => RowType.ReadingError
  }
96
+
73
97
/** Extract a timestamp from enriched TSV line */
74
- def getTstamp (row : String ): Either [RuntimeException , Instant ] = {
75
- val array = row.split(" \t " , - 1 )
98
+ def getTstamp (array : Array [String ]): Either [RuntimeException , Instant ] = {
76
99
for {
77
100
string <- Either
78
101
.catchOnly[IndexOutOfBoundsException ](array(CollectorTstampIdx ))
0 commit comments