1 change: 1 addition & 0 deletions changelog.d/18371.feature
@@ -0,0 +1 @@
Add `total_event_count`, `total_message_count`, and `total_e2ee_event_count` fields to the homeserver usage statistics.
@MadLittleMods (Contributor, Author) commented on May 6, 2025:

Workaround

We already report the `synapse_storage_events_persisted_events` and `synapse_storage_events_persisted_events_sep` metrics, which end up with a `_total` suffix when exported, so you will see `synapse_storage_events_persisted_events_sep_total`, for example. Don't be confused by this `_total` suffix: it's just a Prometheus convention for counters, and it tracks cumulative increments, not the absolute event count in Synapse.
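Because these are cumulative counters, a per-day figure comes from taking the increase over a one-day window. A minimal PromQL sketch (metric name taken from above; the label filters used in the dashboard below are omitted here):

```promql
# Approximate number of events persisted in the last 24 hours,
# derived from the cumulative counter.
increase(synapse_storage_events_persisted_events_total[1d])
```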

We additionally ship a set of Prometheus recording rules that aggregate these metrics into `synapse_storage_events_persisted_by_event_type`.
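For reference, a sketch of what such a recording rule might look like — the label names here are assumptions based on the `_sep` metric, and the authoritative rule lives in the Synapse repository's contrib Prometheus rules:

```yaml
groups:
  - name: synapse
    rules:
      # Collapse the per-origin breakdown of the `_sep` counter into a
      # single series per event type.
      - record: synapse_storage_events_persisted_by_event_type
        expr: >
          sum without (origin_entity, origin_type)
          (synapse_storage_events_persisted_events_sep_total)
```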

These can then be presented in Grafana:

Grafana panel JSON
{
  "datasource": {
    "uid": "$datasource",
    "type": "prometheus"
  },
  "fieldConfig": {
    "defaults": {
      "custom": {
        "lineWidth": 1,
        "fillOpacity": 80,
        "gradientMode": "none",
        "axisPlacement": "auto",
        "axisLabel": "",
        "axisColorMode": "text",
        "axisBorderShow": false,
        "scaleDistribution": {
          "type": "linear"
        },
        "axisCenteredZero": false,
        "hideFrom": {
          "tooltip": false,
          "viz": false,
          "legend": false
        },
        "thresholdsStyle": {
          "mode": "off"
        }
      },
      "color": {
        "mode": "palette-classic"
      },
      "mappings": [],
      "thresholds": {
        "mode": "absolute",
        "steps": [
          {
            "color": "green",
            "value": null
          },
          {
            "color": "red",
            "value": 80
          }
        ]
      },
      "min": 0,
      "unit": "none"
    },
    "overrides": [
      {
        "__systemRef": "hideSeriesFrom",
        "matcher": {
          "id": "byNames",
          "options": {
            "mode": "exclude",
            "names": [
              "m.room.encrypted"
            ],
            "prefix": "All except:",
            "readOnly": true
          }
        },
        "properties": [
          {
            "id": "custom.hideFrom",
            "value": {
              "viz": true,
              "legend": false,
              "tooltip": false
            }
          }
        ]
      }
    ]
  },
  "gridPos": {
    "h": 7,
    "w": 12,
    "x": 12,
    "y": 37
  },
  "id": 46,
  "options": {
    "orientation": "auto",
    "xTickLabelRotation": 0,
    "xTickLabelSpacing": 0,
    "showValue": "auto",
    "stacking": "normal",
    "groupWidth": 0.85,
    "barWidth": 0.97,
    "barRadius": 0,
    "fullHighlight": false,
    "tooltip": {
      "mode": "single",
      "sort": "none"
    },
    "legend": {
      "showLegend": true,
      "displayMode": "list",
      "placement": "bottom",
      "calcs": []
    }
  },
  "pluginVersion": "10.4.1",
  "targets": [
    {
      "datasource": {
        "uid": "$datasource"
      },
      "editorMode": "code",
      "expr": "sum by (type) (increase(synapse_storage_events_persisted_by_event_type{job=~\"$job\",index=~\"$index\",instance=\"$instance\"}[1d]))",
      "format": "time_series",
      "instant": false,
      "intervalFactor": 1,
      "legendFormat": "__auto",
      "refId": "A",
      "step": 20,
      "interval": "1d"
    }
  ],
  "title": "Events per day by Type",
  "type": "barchart"
}

[Screenshot: Grafana graph showing daily counts of sent `m.room.encrypted` messages]

If you retain all of your Prometheus data forever, you can go one step further and derive absolute counts from this, but it's common practice to persist only the last few months of metrics. To get absolute values, you will have to keep track of these daily counts yourself and sum them.
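The bookkeeping this implies can be sketched as follows. This is a hypothetical helper (the function and variable names are illustrative, not part of Synapse): record each day's `increase(...)` result — however you scrape it — in a ledger, and sum the ledger for an absolute running total.

```python
from datetime import date

# Hypothetical ledger: date -> events persisted that day, e.g. populated
# once per day from the Prometheus HTTP API using the 1d `increase` query.
daily_counts: dict[date, int] = {}

def record_daily_count(day: date, count: int) -> None:
    """Store (or overwrite) the daily event count for `day`."""
    daily_counts[day] = count

def absolute_total() -> int:
    """Sum every recorded daily count into an absolute running total."""
    return sum(daily_counts.values())

record_daily_count(date(2025, 5, 1), 1200)
record_daily_count(date(2025, 5, 2), 950)
print(absolute_total())  # prints 2150
```

The ledger survives Prometheus retention limits because it only stores one integer per day, which is cheap to keep indefinitely.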

Given that this PR would require further effort to figure out the concurrency problems, or a refactor to a different solution, we're going to close it in favor of this workaround. The people who collate these reports already collect data manually, so they can simply continue with the database queries or these metrics.

Overall, this kind of feature seems fine, but it turned out to be much more effort than originally planned, and we don't want to continue when people can live without it for now.

@@ -30,10 +30,13 @@ The following statistics are sent to the configured reporting endpoint:
| `python_version` | string | The Python version number in use (e.g "3.7.1"). Taken from `sys.version_info`. |
| `total_users` | int | The number of registered users on the homeserver. |
| `total_nonbridged_users` | int | The number of users, excluding those created by an Application Service. |
| `daily_user_type_native` | int | The number of native users created in the last 24 hours. |
| `daily_user_type_native` | int | The number of native, non-guest users created in the last 24 hours. |
| `daily_user_type_guest` | int | The number of guest users created in the last 24 hours. |
| `daily_user_type_bridged` | int | The number of users created by Application Services in the last 24 hours. |
| `total_room_count` | int | The total number of rooms present on the homeserver. |
| `total_event_count` | int | The total number of events present on the homeserver. |
| `total_message_count` | int | The total number of non-state events with type `m.room.message` present on the homeserver. |
| `total_e2ee_event_count` | int | The total number of non-state events with type `m.room.encrypted` present on the homeserver. This can be used as a slight over-estimate for the number of encrypted messages. |
| `daily_active_users` | int | The number of unique users[^1] that have used the homeserver in the last 24 hours. |
| `monthly_active_users` | int | The number of unique users[^1] that have used the homeserver in the last 30 days. |
| `daily_active_rooms` | int | The number of rooms that have had a (state) event with the type `m.room.message` sent in them in the last 24 hours. |
@@ -50,8 +53,8 @@ The following statistics are sent to the configured reporting endpoint:
| `cache_factor` | int | The configured [`global factor`](../../configuration/config_documentation.md#caching) value for caching. |
| `event_cache_size` | int | The configured [`event_cache_size`](../../configuration/config_documentation.md#caching) value for caching. |
| `database_engine` | string | The database engine that is in use. Either "psycopg2" meaning PostgreSQL is in use, or "sqlite3" for SQLite3. |
| `database_server_version` | string | The version of the database server. Examples being "10.10" for PostgreSQL server version 10.10, and "3.38.5" for SQLite 3.38.5 installed on the system. |
| `log_level` | string | The log level in use. Examples are "INFO", "WARNING", "ERROR", "DEBUG", etc. |
| `database_server_version` | string | The version of the database server. Examples being "10.10" for PostgreSQL server version 10.10, and "3.38.5" for SQLite 3.38.5 installed on the system. |
| `log_level` | string | The log level in use. Examples are "INFO", "WARNING", "ERROR", "DEBUG", etc. |


[^1]: Native matrix users and guests are always counted. If the
41 changes: 36 additions & 5 deletions synapse/app/phone_stats_home.py
@@ -34,6 +34,22 @@

logger = logging.getLogger("synapse.app.homeserver")

ONE_MINUTE_SECONDS = 60
ONE_HOUR_SECONDS = 60 * ONE_MINUTE_SECONDS

MILLISECONDS_PER_SECOND = 1000

INITIAL_DELAY_BEFORE_FIRST_PHONE_HOME_SECONDS = 5 * ONE_MINUTE_SECONDS
"""
We wait 5 minutes to send the first set of stats as the server can be quite busy the
first few minutes
"""

PHONE_HOME_INTERVAL_SECONDS = 3 * ONE_HOUR_SECONDS
"""
Phone home stats are sent every 3 hours
"""

# Contains the list of processes we will be monitoring
# currently either 0 or 1
_stats_process: List[Tuple[int, "resource.struct_rusage"]] = []
@@ -121,6 +137,9 @@ async def phone_stats_home(

room_count = await store.get_room_count()
stats["total_room_count"] = room_count
stats["total_event_count"] = await store.count_total_events()
stats["total_message_count"] = await store.count_total_messages()
stats["total_e2ee_event_count"] = await store.count_total_e2ee_events()

stats["daily_active_users"] = common_metrics.daily_active_users
stats["monthly_active_users"] = await store.count_monthly_users()
@@ -185,12 +204,14 @@ def performance_stats_init() -> None:
# If you increase the loop period, the accuracy of user_daily_visits
# table will decrease
clock.looping_call(
hs.get_datastores().main.generate_user_daily_visits, 5 * 60 * 1000
hs.get_datastores().main.generate_user_daily_visits,
5 * ONE_MINUTE_SECONDS * MILLISECONDS_PER_SECOND,
)

# monthly active user limiting functionality
clock.looping_call(
hs.get_datastores().main.reap_monthly_active_users, 1000 * 60 * 60
hs.get_datastores().main.reap_monthly_active_users,
ONE_HOUR_SECONDS * MILLISECONDS_PER_SECOND,
)
hs.get_datastores().main.reap_monthly_active_users()

@@ -216,17 +237,27 @@ async def generate_monthly_active_users() -> None:

if hs.config.server.limit_usage_by_mau or hs.config.server.mau_stats_only:
generate_monthly_active_users()
clock.looping_call(generate_monthly_active_users, 5 * 60 * 1000)
clock.looping_call(
generate_monthly_active_users,
5 * ONE_MINUTE_SECONDS * MILLISECONDS_PER_SECOND,
)
# End of monthly active user settings

if hs.config.metrics.report_stats:
logger.info("Scheduling stats reporting for 3 hour intervals")
clock.looping_call(phone_stats_home, 3 * 60 * 60 * 1000, hs, stats)
clock.looping_call(
phone_stats_home,
PHONE_HOME_INTERVAL_SECONDS * MILLISECONDS_PER_SECOND,
hs,
stats,
)

# We need to defer this init for the cases that we daemonize
# otherwise the process ID we get is that of the non-daemon process
clock.call_later(0, performance_stats_init)

# We wait 5 minutes to send the first set of stats as the server can
# be quite busy the first few minutes
clock.call_later(5 * 60, phone_stats_home, hs, stats)
clock.call_later(
INITIAL_DELAY_BEFORE_FIRST_PHONE_HOME_SECONDS, phone_stats_home, hs, stats
)
4 changes: 4 additions & 0 deletions synapse/rest/client/room.py
@@ -378,6 +378,8 @@ async def _do(
) -> Tuple[int, JsonDict]:
content = parse_json_object_from_request(request)

logger.info("asdf event send in %s (%s)", room_id, content)

origin_server_ts = None
if requester.app_service:
origin_server_ts = parse_integer(request, "ts")
@@ -419,6 +421,8 @@
except ShadowBanError:
event_id = generate_fake_event_id()

logger.info("asdf event send DONE in %s (%s) -> %s", room_id, content, event_id)

set_tag("event_id", event_id)
return 200, {"event_id": event_id}
