From 4fdeb277eef7812d44cb11a48bf950d1505e9290 Mon Sep 17 00:00:00 2001 From: Zixuan James Li Date: Tue, 6 Aug 2024 19:13:21 -0400 Subject: [PATCH 1/2] store: Report polling errors with details We delay reporting transient errors until we have retried a certain number of times, because they might recover on their own. Signed-off-by: Zixuan James Li --- assets/l10n/app_en.arb | 12 +++++++ lib/model/store.dart | 35 ++++++++++++++++++-- test/model/store_test.dart | 66 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 111 insertions(+), 2 deletions(-) diff --git a/assets/l10n/app_en.arb b/assets/l10n/app_en.arb index b8af6e155f..1f534fe4a3 100644 --- a/assets/l10n/app_en.arb +++ b/assets/l10n/app_en.arb @@ -160,6 +160,18 @@ "message": {"type": "String", "example": "Invalid format"} } }, + "errorConnectingToServerShort": "Error connecting to Zulip. Retrying…", + "@errorConnectingToServerShort": { + "description": "Short error message for a generic unknown error connecting to the server." + }, + "errorConnectingToServerDetails": "Error connecting to Zulip at {serverUrl}. Will retry:\n\n{error}", + "@errorConnectingToServerDetails": { + "description": "Dialog error message for a generic unknown error connecting to the server with details.", + "placeholders": { + "serverUrl": {"type": "String", "example": "http://example.com/"}, + "error": {"type": "String", "example": "Invalid format"} + } + }, "errorSharingFailed": "Sharing failed", "@errorSharingFailed": { "description": "Error message when sharing a message failed." diff --git a/lib/model/store.dart b/lib/model/store.dart index 8929c45adb..94c1e5dd6e 100644 --- a/lib/model/store.dart +++ b/lib/model/store.dart @@ -22,6 +22,7 @@ import '../notifications/receive.dart'; import 'autocomplete.dart'; import 'database.dart'; import 'emoji.dart'; +import 'localizations.dart'; import 'message.dart'; import 'message_list.dart'; import 'recent_dm_conversations.dart'; @@ -912,10 +913,28 @@ class UpdateMachine { }()); } + /// This controls when we start to report transient errors to the user when + /// polling. + /// + /// At the 6th failure, the expected time elapsed since the first failure + /// will be 1.55 seocnds. + static const transientFailureCountNotifyThreshold = 5; + void poll() async { assert(!_disposed); BackoffMachine? backoffMachine; + int accumulatedTransientFailureCount = 0; + + /// This only reports transient errors after reaching + /// a pre-defined threshold of retries. + void maybeReportTransientError(String? message, {String? details}) { + accumulatedTransientFailureCount++; + if (accumulatedTransientFailureCount > transientFailureCountNotifyThreshold) { + reportErrorToUserBriefly(message, details: details); + } + } + while (true) { if (_debugLoopSignal != null) { await _debugLoopSignal!.future; @@ -935,6 +954,8 @@ class UpdateMachine { if (_disposed) return; store.isLoading = true; + final localizations = GlobalLocalizations.zulipLocalizations; + final serverUrl = store.connection.realmUrl.origin; switch (e) { case ZulipApiException(code: 'BAD_EVENT_QUEUE_ID'): assert(debugLog('Lost event queue for $store. Replacing…')); @@ -946,7 +967,10 @@ class UpdateMachine { case Server5xxException() || NetworkException(): assert(debugLog('Transient error polling event queue for $store: $e\n' 'Backing off, then will retry…')); - // TODO tell user if transient polling errors persist + maybeReportTransientError( + localizations.errorConnectingToServerShort, + details: localizations.errorConnectingToServerDetails( + serverUrl, e.toString())); await (backoffMachine ??= BackoffMachine()).wait(); assert(debugLog('… Backoff wait complete, retrying poll.')); continue; @@ -954,7 +978,11 @@ class UpdateMachine { default: assert(debugLog('Error polling event queue for $store: $e\n' 'Backing off and retrying even though may be hopeless…')); - // TODO tell user on non-transient error in polling + // TODO(#186): Handle unrecoverable failures + reportErrorToUserBriefly( + localizations.errorConnectingToServerShort, + details: localizations.errorConnectingToServerDetails( + serverUrl, e.toString())); await (backoffMachine ??= BackoffMachine()).wait(); assert(debugLog('… Backoff wait complete, retrying poll.')); continue; @@ -978,6 +1006,9 @@ class UpdateMachine { // and failures, the successes themselves should space out the requests. backoffMachine = null; store.isLoading = false; + // Dismiss existing errors, if any. + reportErrorToUserBriefly(null); + accumulatedTransientFailureCount = 0; final events = result.events; for (final event in events) { diff --git a/test/model/store_test.dart b/test/model/store_test.dart index 1fe99fe46b..b79fe2cf4b 100644 --- a/test/model/store_test.dart +++ b/test/model/store_test.dart @@ -1,6 +1,7 @@ import 'dart:async'; import 'package:checks/checks.dart'; +import 'package:fake_async/fake_async.dart'; import 'package:flutter/foundation.dart'; import 'package:http/http.dart' as http; import 'package:test/scaffolding.dart'; @@ -12,6 +13,7 @@ import 'package:zulip/api/route/messages.dart'; import 'package:zulip/api/route/realm.dart'; import 'package:zulip/model/message_list.dart'; import 'package:zulip/model/narrow.dart'; +import 'package:zulip/log.dart'; import 'package:zulip/model/store.dart'; import 'package:zulip/notifications/receive.dart'; @@ -652,6 +654,70 @@ void main() { test('retries on MalformedServerResponseException', () { checkRetry(() => connection.prepare(httpStatus: 200, body: 'nonsense')); }); + + group('report error', () { + String? lastReportedError; + String? takeLastReportedError() { + final result = lastReportedError; + lastReportedError = null; + return result; + } + + /// This is an alternative to [ZulipApp]'s implementation of + /// [reportErrorToUserBriefly] for testing. + Future logAndReportErrorToUserBriefly(String? message, { + String? details, + }) async { + if (message == null) return; + lastReportedError = '$message\n$details'; + } + + Future prepare() async { + reportErrorToUserBriefly = logAndReportErrorToUserBriefly; + addTearDown(() => reportErrorToUserBriefly = defaultReportErrorToUserBriefly); + + await prepareStore(lastEventId: 1); + updateMachine.debugPauseLoop(); + updateMachine.poll(); + } + + void pollAndFail(FakeAsync async) { + updateMachine.debugAdvanceLoop(); + async.elapse(Duration.zero); + checkLastRequest(lastEventId: 1); + check(store).isLoading.isTrue(); + } + + test('report non-transient errors', () => awaitFakeAsync((async) async { + await prepare(); + + connection.prepare(httpStatus: 400, json: { + 'result': 'error', 'code': 'BAD_REQUEST', 'msg': 'Bad request'}); + pollAndFail(async); + check(takeLastReportedError()).isNotNull().startsWith( + "Error connecting to Zulip. Retrying…\n" + "Error connecting to Zulip at"); + })); + + test('report transient errors', () => awaitFakeAsync((async) async { + await prepare(); + + // There should be no user visible error messages during these retries. + for (int i = 0; i < UpdateMachine.transientFailureCountNotifyThreshold; i++) { + connection.prepare(httpStatus: 500, body: 'splat'); + pollAndFail(async); + check(takeLastReportedError()).isNull(); + // This skips the pending polling backoff. + async.flushTimers(); + } + + connection.prepare(httpStatus: 500, body: 'splat'); + pollAndFail(async); + check(takeLastReportedError()).isNotNull().startsWith( + "Error connecting to Zulip. Retrying…\n" + "Error connecting to Zulip at"); + })); + }); }); group('UpdateMachine.registerNotificationToken', () { From 33c365af73c8e2a9342d71b16a6fc967e6a35a84 Mon Sep 17 00:00:00 2001 From: Zixuan James Li Date: Fri, 27 Sep 2024 17:46:50 -0400 Subject: [PATCH 2/2] store: Ignore boring NetworkExceptions Usually when a client goes back from sleep or lose network connection, there will be a bunch of errors when polling event queue. This known error will always be a `NetworkException`, so we make a separate case for it. Previously, we may report these errors, and it ended up being spammy. This filters out the more interesting one to report instead. Fixes: #555 Signed-off-by: Zixuan James Li --- lib/model/store.dart | 17 ++++++++++++++++- test/model/store_test.dart | 19 +++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/lib/model/store.dart b/lib/model/store.dart index 94c1e5dd6e..63f846bf6a 100644 --- a/lib/model/store.dart +++ b/lib/model/store.dart @@ -964,7 +964,7 @@ class UpdateMachine { debugLog('… Event queue replaced.'); return; - case Server5xxException() || NetworkException(): + case Server5xxException(): assert(debugLog('Transient error polling event queue for $store: $e\n' 'Backing off, then will retry…')); maybeReportTransientError( @@ -975,6 +975,21 @@ class UpdateMachine { assert(debugLog('… Backoff wait complete, retrying poll.')); continue; + case NetworkException(): + assert(debugLog('Transient error polling event queue for $store: $e\n' + 'Backing off, then will retry…')); + if (e.cause is! SocketException) { + // Heuristic check to only report interesting errors to the user. + // A [SocketException] is common when the app returns from sleep. + maybeReportTransientError( + localizations.errorConnectingToServerShort, + details: localizations.errorConnectingToServerDetails( + serverUrl, e.toString())); + } + await (backoffMachine ??= BackoffMachine()).wait(); + assert(debugLog('… Backoff wait complete, retrying poll.')); + continue; + default: assert(debugLog('Error polling event queue for $store: $e\n' 'Backing off and retrying even though may be hopeless…')); diff --git a/test/model/store_test.dart b/test/model/store_test.dart index b79fe2cf4b..6819dcfb6a 100644 --- a/test/model/store_test.dart +++ b/test/model/store_test.dart @@ -1,4 +1,5 @@ import 'dart:async'; +import 'dart:io'; import 'package:checks/checks.dart'; import 'package:fake_async/fake_async.dart'; @@ -717,6 +718,24 @@ void main() { "Error connecting to Zulip. Retrying…\n" "Error connecting to Zulip at"); })); + + test('ignore boring errors', () => awaitFakeAsync((async) async { + await prepare(); + + for (int i = 0; i < UpdateMachine.transientFailureCountNotifyThreshold; i++) { + connection.prepare(exception: const SocketException('failed')); + pollAndFail(async); + check(takeLastReportedError()).isNull(); + // This skips the pending polling backoff. + async.flushTimers(); + } + + connection.prepare(exception: const SocketException('failed')); + pollAndFail(async); + // Normally we start showing user visible error messages for transient + // errors after enough number of retries. + check(takeLastReportedError()).isNull(); + })); }); });