
Commit 45ce93b

Authored Mar 23, 2023
fix: higher application blocking latency precision (#1676)
Use nanoseconds for application blocking latency to reduce rounding errors from starting and stopping the timer within a millisecond.
1 parent fc058e9 commit 45ce93b
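
The rounding issue is easiest to see with a sub-millisecond call. The snippet below is a hypothetical illustration (not code from this repository) using Guava's Stopwatch, which the tracer already relies on: a call that takes a few hundred microseconds truncates to 0 when measured in milliseconds, so accumulating many such calls undercounts total server latency and inflates the derived application blocking latency (operationLatency - serverLatency).

```java
import com.google.common.base.Stopwatch;
import java.util.concurrent.TimeUnit;

// Hypothetical demo of millisecond truncation for a call that finishes in under a millisecond.
public class RoundingDemo {
  public static void main(String[] args) throws InterruptedException {
    Stopwatch timer = Stopwatch.createStarted();
    Thread.sleep(0, 400_000); // simulate ~400 microseconds of server time
    timer.stop();

    // Millisecond precision truncates the sample to 0 ...
    System.out.println(timer.elapsed(TimeUnit.MILLISECONDS)); // prints 0
    // ... while nanosecond precision preserves it, so short calls still add up.
    System.out.println(timer.elapsed(TimeUnit.NANOSECONDS));  // prints roughly 400,000
  }
}
```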

File tree: 1 file changed, +6 -4 lines changed
 

google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/stub/metrics/BuiltinMetricsTracer.java (+6 -4)
@@ -55,7 +55,7 @@ class BuiltinMetricsTracer extends BigtableTracer {
   // Total server latency needs to be atomic because it's accessed from different threads. E.g.
   // request() from user thread and attempt failed from grpc thread. We're only measuring the extra
   // time application spent blocking grpc buffer, which will be operationLatency - serverLatency.
-  private final AtomicLong totalServerLatency = new AtomicLong(0);
+  private final AtomicLong totalServerLatencyNano = new AtomicLong(0);
   // Stopwatch is not thread safe so this is a workaround to check if the stopwatch changes is
   // flushed to memory.
   private final Stopwatch serverLatencyTimer = Stopwatch.createUnstarted();
@@ -171,7 +171,7 @@ public void responseReceived() {
     // In all the cases, we want to stop the serverLatencyTimer here.
     synchronized (timerLock) {
       if (serverLatencyTimerIsRunning) {
-        totalServerLatency.addAndGet(serverLatencyTimer.elapsed(TimeUnit.MILLISECONDS));
+        totalServerLatencyNano.addAndGet(serverLatencyTimer.elapsed(TimeUnit.NANOSECONDS));
         serverLatencyTimer.reset();
         serverLatencyTimerIsRunning = false;
       }
@@ -233,6 +233,7 @@ private void recordOperationCompletion(@Nullable Throwable status) {
     }
     operationTimer.stop();
     long operationLatency = operationTimer.elapsed(TimeUnit.MILLISECONDS);
+    long operationLatencyNano = operationTimer.elapsed(TimeUnit.NANOSECONDS);

     // Only record when retry count is greater than 0 so the retry
     // graph will be less confusing
@@ -242,7 +243,8 @@ private void recordOperationCompletion(@Nullable Throwable status) {

     // serverLatencyTimer should already be stopped in recordAttemptCompletion
     recorder.putOperationLatencies(operationLatency);
-    recorder.putApplicationLatencies(operationLatency - totalServerLatency.get());
+    recorder.putApplicationLatencies(
+        Duration.ofNanos(operationLatencyNano - totalServerLatencyNano.get()).toMillis());

     if (operationType == OperationType.ServerStreaming
         && spanName.getMethodName().equals("ReadRows")) {
@@ -258,7 +260,7 @@ private void recordAttemptCompletion(@Nullable Throwable status) {
     synchronized (timerLock) {
       if (serverLatencyTimerIsRunning) {
         requestLeft.decrementAndGet();
-        totalServerLatency.addAndGet(serverLatencyTimer.elapsed(TimeUnit.MILLISECONDS));
+        totalServerLatencyNano.addAndGet(serverLatencyTimer.elapsed(TimeUnit.NANOSECONDS));
         serverLatencyTimer.reset();
         serverLatencyTimerIsRunning = false;
       }
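
In isolation, the pattern the diff adopts looks roughly like the sketch below: accumulate server-side time in nanoseconds, subtract it from the operation time in nanoseconds, and convert the difference to milliseconds once at reporting time. This is a simplified sketch under stated assumptions, not the tracer's actual class; the locking, retry handling, and recorder wiring from BuiltinMetricsTracer are omitted.

```java
import com.google.common.base.Stopwatch;
import java.time.Duration;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;

// Simplified sketch of the nanosecond-accumulation pattern; field names mirror the diff,
// but the surrounding structure and method names are assumptions for illustration.
class ApplicationLatencySketch {
  private final AtomicLong totalServerLatencyNano = new AtomicLong(0);
  private final Stopwatch serverLatencyTimer = Stopwatch.createUnstarted();
  private final Stopwatch operationTimer = Stopwatch.createStarted();

  void serverCallStarted() {
    serverLatencyTimer.start();
  }

  void responseReceived() {
    // Accumulate in nanoseconds so sub-millisecond responses are not rounded down to 0.
    totalServerLatencyNano.addAndGet(serverLatencyTimer.elapsed(TimeUnit.NANOSECONDS));
    serverLatencyTimer.reset();
  }

  long applicationBlockingLatencyMillis() {
    long operationLatencyNano = operationTimer.elapsed(TimeUnit.NANOSECONDS);
    // Subtract in nanoseconds first, then convert the difference to milliseconds once.
    return Duration.ofNanos(operationLatencyNano - totalServerLatencyNano.get()).toMillis();
  }
}
```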
