This repository was archived by the owner on Feb 16, 2024. It is now read-only.

Commit 824627c

Add demo kafka-druid-water-level-data (#76)
## Description

The demo is basically finished; the dashboard could be further improved. Data comes from https://www.pegelonline.wsv.de/gast/start. The demo bulk-loads the past 31 days from the API and then starts scraping the stations approximately every 2 minutes. It is built on top of the existing `kafka-druid-superset-s3` stack. Stacks ftw 🚀

Docs will follow; I currently have lots of other docs to write.

Run from the feature branch with `stackablectl --additional-demos-file demos/demos-v1.yaml --additional-stacks-file stacks/stacks-v1.yaml demo install kafka-druid-water-level-data`.

The dashboard looks something like this:

![Screenshot_20220818_120529](https://user-images.githubusercontent.com/29303194/185370281-cee5ab49-e1a5-431e-8be1-450d3984e7a0.png)
1 parent 0690d66 commit 824627c
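
The data source is the public PEGELONLINE REST API. A minimal sketch of what the demo scrapes (not part of this commit; the URLs are taken from the ingestion script added below, and pandas is assumed to be available):

# Peek at the PEGELONLINE REST API that the demo ingests into Kafka.
import pandas as pd

# All gauging stations (one row per station, each with a nested "water" object)
stations = pd.read_json("https://www.pegelonline.wsv.de/webservices/rest-api/v2/stations.json")
print(f"{len(stations)} stations")

# Current water level ("W") measurement for the first station
uuid = stations.loc[0, "uuid"]
current = pd.read_json(
    f"https://www.pegelonline.wsv.de/webservices/rest-api/v2/stations/{uuid}/W/currentmeasurement.json",
    typ="series",
)
print(f"{stations.loc[0, 'longname']}: {current['value']}")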


44 files changed (+832 / -35 lines)

demos/demos-v1.yaml

Lines changed: 15 additions & 0 deletions
@@ -29,3 +29,18 @@ demos:
      - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/kafka-druid-earthquake-data/ingest-test-data.yaml
      - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/kafka-druid-earthquake-data/create-druid-ingestion-job.yaml
      - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/kafka-druid-earthquake-data/setup-superset.yaml
+  kafka-druid-water-level-data:
+    description: Demo ingesting water level data into Kafka, streaming it into Druid and creating a Superset dashboard
+    documentation: https://docs.stackable.tech/stackablectl/stable/demos/kafka-druid-water-level-data.html
+    stackableStack: kafka-druid-superset-s3
+    labels:
+      - kafka
+      - druid
+      - superset
+      - minio
+      - s3
+      - water-levels
+    manifests:
+      - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/kafka-druid-water-level-data/ingest-test-data.yaml
+      - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/kafka-druid-water-level-data/create-druid-ingestion-job.yaml
+      - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/kafka-druid-water-level-data/setup-superset.yaml
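
For context, a rough sketch (not from the commit) of how such an entry can be read back from the demos file; it assumes only the layout visible in the diff above and that PyYAML is installed:

import urllib.request
import yaml  # PyYAML, assumed to be available

URL = "https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/demos-v1.yaml"
with urllib.request.urlopen(URL) as resp:
    demos = yaml.safe_load(resp)["demos"]

demo = demos["kafka-druid-water-level-data"]
print(demo["stackableStack"])        # kafka-druid-superset-s3
for manifest in demo["manifests"]:
    print(manifest["plainYaml"])     # the three plainYaml URLs shown above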

demos/kafka-druid-earthquake-data/create-druid-ingestion-job.yaml

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@ spec:
    spec:
      containers:
        - name: create-druid-ingestion-job
-          image: python:3.10 # Using same image so caching can happen
+          image: docker.stackable.tech/stackable/testing-tools:0.1.0-stackable0.1.0
           command: ["bash", "-c", "curl -X POST -H 'Content-Type: application/json' -d @/tmp/ingestion-job-spec/ingestion-job-spec.json http://druid-coordinator:8081/druid/indexer/v1/supervisor"]
           volumeMounts:
             - name: ingestion-job-spec
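
The Job above registers the ingestion spec by POSTing it to Druid's supervisor endpoint. A hedged way to check the result (not part of the commit) is to list the registered supervisors on the same endpoint:

import requests

# Same host/port the Job uses; returns the IDs of all registered supervisors.
resp = requests.get("http://druid-coordinator:8081/druid/indexer/v1/supervisor")
resp.raise_for_status()
print(resp.json())  # expected to contain the earthquake datasource once the supervisor exists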

demos/kafka-druid-earthquake-data/ingest-test-data.yaml

Lines changed: 7 additions & 6 deletions
@@ -8,8 +8,8 @@ spec:
    spec:
      containers:
        - name: ingest-test-data
-          image: python:3.10
-          command: ["bash", "-c", "curl -o earthquake.csv https://repo.stackable.tech/repository/misc/earthquake-data/earthquakes_1950_to_2022.csv && pip install pandas==1.4.2 kafka-python3==3.0.0 && python /tmp/script/script.py"]
+          image: docker.stackable.tech/stackable/testing-tools:0.1.0-stackable0.1.0
+          command: ["bash", "-c", "python -u /tmp/script/script.py"]
           volumeMounts:
             - name: script
               mountPath: /tmp/script
@@ -31,17 +31,18 @@ data:
    from kafka3 import KafkaProducer
    import time

-    BOOTSTRAP_SERVERS = "kafka:9092"
+    BOOTSTRAP_SERVERS = "kafka:9092" # For local testing / developing replace it, afterwards change back to kafka:9092
     TOPIC = "earthquakes"
-    CSV_FILE = "earthquake.csv"
+    CSV_FILE = "https://repo.stackable.tech/repository/misc/earthquake-data/earthquakes_1950_to_2022.csv"
     TARGET_RECORDS_PER_SECOND = 1000

     print(f"Producing {TARGET_RECORDS_PER_SECOND} records/s from {CSV_FILE} to topic {TOPIC} with bootstrap servers {BOOTSTRAP_SERVERS}\n")

-    csv_file = pd.DataFrame(pd.read_csv(CSV_FILE, sep=","))
-
+    # Create producer first to early error out if Kafka is not ready yet to reduce unnecessary network usage
     producer = KafkaProducer(bootstrap_servers=BOOTSTRAP_SERVERS)

+    csv_file = pd.DataFrame(pd.read_csv(CSV_FILE, sep=","))
+
     for row in csv_file.index:
         starttime = time.time()
         row_json = csv_file.loc[row].to_json()
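
The script targets a fixed throughput (TARGET_RECORDS_PER_SECOND) by taking a timestamp per record; a plausible, simplified sketch of that pacing pattern follows (not the committed code, since the remainder of the loop is hidden above):

import time

TARGET_RECORDS_PER_SECOND = 1000

def paced(send_one, records):
    # Send each record, then sleep away whatever is left of its 1/N-second slot.
    for record in records:
        starttime = time.time()
        send_one(record)
        time.sleep(max(0, 1 / TARGET_RECORDS_PER_SECOND - (time.time() - starttime)))

paced(print, range(5))  # stand-in for producer.send(TOPIC, ...)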

demos/kafka-druid-earthquake-data/setup-superset.yaml

Lines changed: 3 additions & 4 deletions
@@ -8,8 +8,8 @@ spec:
    spec:
      containers:
        - name: setup-superset
-          image: python:3.10
-          command: ["bash", "-c", "curl -o superset-assets.zip https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/kafka-druid-earthquake-data/superset-assets.zip && pip install requests==2.22.0 && python /tmp/script/script.py"]
+          image: docker.stackable.tech/stackable/testing-tools:0.1.0-stackable0.1.0
+          command: ["bash", "-c", "curl -o superset-assets.zip https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/kafka-druid-earthquake-data/superset-assets.zip && python -u /tmp/script/script.py"]
           volumeMounts:
             - name: script
               mountPath: /tmp/script
@@ -30,8 +30,7 @@ data:
    import logging
    import requests

-    base_url = "http://superset-external:8088"
-    # base_url = "http://172.18.0.4:31024"
+    base_url = "http://superset-external:8088" # For local testing / developing replace it, afterwards change back to http://superset-external:8088
     username = "admin"
     password = "admin"

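
The setup script drives Superset through its REST API at base_url with the admin/admin credentials shown above. A hedged sketch of that interaction (not from the commit; the endpoint paths and payloads are assumptions based on Superset's /api/v1 interface):

import requests

base_url = "http://superset-external:8088"

# Log in and obtain a bearer token
token = requests.post(
    f"{base_url}/api/v1/security/login",
    json={"username": "admin", "password": "admin", "provider": "db", "refresh": True},
).json()["access_token"]

# List the dashboards created by importing superset-assets.zip
dashboards = requests.get(
    f"{base_url}/api/v1/dashboard/",
    headers={"Authorization": f"Bearer {token}"},
).json()
print([d["dashboard_title"] for d in dashboards["result"]])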
Lines changed: 141 additions & 0 deletions
@@ -0,0 +1,141 @@
---
apiVersion: batch/v1
kind: Job
metadata:
  name: create-druid-ingestion-job
spec:
  template:
    spec:
      containers:
        - name: create-druid-ingestion-job
          image: docker.stackable.tech/stackable/testing-tools:0.1.0-stackable0.1.0
          command: ["bash", "-c", "curl -X POST -H 'Content-Type: application/json' -d @/tmp/ingestion-job-spec/stations-ingestion-job-spec.json http://druid-coordinator:8081/druid/indexer/v1/supervisor && curl -X POST -H 'Content-Type: application/json' -d @/tmp/ingestion-job-spec/measurements-ingestion-job-spec.json http://druid-coordinator:8081/druid/indexer/v1/supervisor"]
          volumeMounts:
            - name: ingestion-job-spec
              mountPath: /tmp/ingestion-job-spec
      restartPolicy: OnFailure
      volumes:
        - name: ingestion-job-spec
          configMap:
            name: create-druid-ingestion-job-spec
  restartPolicy: Never
  backoffLimit: 50 # It can take some time until Druid is ready
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: create-druid-ingestion-job-spec
data:
  stations-ingestion-job-spec.json: |
    {
      "type": "kafka",
      "spec": {
        "ioConfig": {
          "type": "kafka",
          "consumerProperties": {
            "bootstrap.servers": "kafka:9092"
          },
          "topic": "stations",
          "inputFormat": {
            "type": "json",
            "flattenSpec": {
              "fields": [
                {
                  "name": "water_longname",
                  "type": "path",
                  "expr": "$.water.longname"
                },
                {
                  "name": "water_shortname",
                  "type": "path",
                  "expr": "$.water.shortname"
                }
              ]
            }
          },
          "useEarliestOffset": true
        },
        "tuningConfig": {
          "type": "kafka"
        },
        "dataSchema": {
          "dataSource": "stations",
          "timestampSpec": {
            "column": "!!!_no_such_column_!!!",
            "missingValue": "2000-01-01T00:00:00Z"
          },
          "dimensionsSpec": {
            "dimensions": [
              "water_longname",
              "water_shortname",
              "uuid",
              {
                "type": "long",
                "name": "number"
              },
              "shortname",
              "longname",
              {
                "type": "double",
                "name": "km"
              },
              "agency",
              {
                "type": "double",
                "name": "longitude"
              },
              {
                "type": "double",
                "name": "latitude"
              }
            ]
          },
          "granularitySpec": {
            "queryGranularity": "none",
            "rollup": false,
            "segmentGranularity": "all"
          }
        }
      }
    }
  measurements-ingestion-job-spec.json: |
    {
      "type": "kafka",
      "spec": {
        "ioConfig": {
          "type": "kafka",
          "consumerProperties": {
            "bootstrap.servers": "kafka:9092"
          },
          "topic": "measurements",
          "inputFormat": {
            "type": "json"
          },
          "useEarliestOffset": true
        },
        "tuningConfig": {
          "type": "kafka"
        },
        "dataSchema": {
          "dataSource": "measurements",
          "timestampSpec": {
            "column": "timestamp",
            "format": "millis"
          },
          "dimensionsSpec": {
            "dimensions": [
              {
                "type": "long",
                "name": "value"
              },
              "station_uuid"
            ]
          },
          "granularitySpec": {
            "queryGranularity": "none",
            "rollup": false,
            "segmentGranularity": "day"
          }
        }
      }
    }
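
Two details of the stations spec are worth spelling out: the flattenSpec lifts the nested water object onto top-level columns, and the timestampSpec points at a column that never exists, so every station row gets the fixed missingValue timestamp (station metadata has no natural time axis). A small illustration with made-up placeholder values (not from the commit):

# A station document roughly as it arrives on the "stations" topic (placeholder values).
station = {
    "uuid": "00000000-0000-0000-0000-000000000000",
    "number": 1234567,
    "shortname": "EXAMPLE",
    "longname": "EXAMPLE STATION",
    "km": 1.0,
    "agency": "EXAMPLE AGENCY",
    "longitude": 9.0,
    "latitude": 52.0,
    "water": {"shortname": "RIVER", "longname": "RIVER LONGNAME"},
}

# What the flattenSpec produces as ingested columns:
row = {k: v for k, v in station.items() if k != "water"}
row["water_shortname"] = station["water"]["shortname"]  # $.water.shortname
row["water_longname"] = station["water"]["longname"]    # $.water.longname
row["__time"] = "2000-01-01T00:00:00Z"                  # missingValue, since the timestamp column never exists
print(row)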
Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
---
apiVersion: batch/v1
kind: Job
metadata:
  name: ingest-test-data
spec:
  template:
    spec:
      containers:
        - name: ingest-test-data
          image: docker.stackable.tech/stackable/testing-tools:0.1.0-stackable0.1.0
          command: ["bash", "-c", "python -u /tmp/script/script.py"]
          volumeMounts:
            - name: script
              mountPath: /tmp/script
      restartPolicy: OnFailure
      volumes:
        - name: script
          configMap:
            name: ingest-test-data-script
  restartPolicy: Never
  backoffLimit: 50 # It can take some time until Kafka is ready
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: ingest-test-data-script
data:
  script.py: |
    import json
    from kafka3 import KafkaProducer
    import pandas as pd
    import time

    HISTORY_DAYS = 31
    # At a maximum scrape once every minute.
    # In practice it will take longer than a minute to scrape, so the loop will be busy.
    # This will result in a continuous stream of data.
    LIVE_UPDATE_INTERVAL_S = 60

    BOOTSTRAP_SERVERS = "kafka:9092" # For local testing / developing replace it, afterwards change back to kafka:9092
    STATIONS_TOPIC = "stations"
    MEASUREMENTS_TOPIC = "measurements"

    print(f"Producing station records to topic {STATIONS_TOPIC} with bootstrap servers {BOOTSTRAP_SERVERS}\n")
    producer = KafkaProducer(bootstrap_servers=BOOTSTRAP_SERVERS)

    stations = pd.read_json("https://www.pegelonline.wsv.de/webservices/rest-api/v2/stations.json")
    print(f"Stations:\n{stations}")
    for station in stations.index:
        station_json = stations.loc[station].to_json()
        producer.send(STATIONS_TOPIC, str.encode(station_json))

    print(f"Producing measurement records of the last {HISTORY_DAYS} days to topic {MEASUREMENTS_TOPIC} with bootstrap servers {BOOTSTRAP_SERVERS}\n")

    # Using a separate loop to first send all stations and then the measurements
    for station in stations.index:
        station = stations.loc[station]
        station_uuid = station["uuid"]
        url = f"https://www.pegelonline.wsv.de/webservices/rest-api/v2/stations/{station_uuid}/W/measurements.json?start=P{HISTORY_DAYS}D"
        try:
            measurements = pd.read_json(url)
        except Exception as err:
            print(f"[WARN] Could not read measurements for station {station['longname']} ({station_uuid}): {err}")
            continue
        measurements['station_uuid'] = station_uuid
        for measurement in measurements.index:
            measurement_json = measurements.loc[measurement].to_json()
            producer.send(MEASUREMENTS_TOPIC, str.encode(measurement_json))

        print(f"Sent {len(measurements)} measurements for station {station['longname']}")


    print(f"Finished loading {HISTORY_DAYS} days of historic data, now starting live streaming")
    while True:
        starttime = time.time()
        measurement_counter = 0
        measurement_failed_counter = 0
        for station_uuid in stations["uuid"]:
            url = f"https://www.pegelonline.wsv.de/webservices/rest-api/v2/stations/{station_uuid}/W/currentmeasurement.json"
            try:
                measurement = pd.read_json(url, typ='series')
            except Exception as err:
                measurement_failed_counter += 1
                continue

            measurement = {
                "timestamp": int(time.time() * 1000),
                "value": measurement["value"],
                "station_uuid": station_uuid
            }
            measurement_json = json.dumps(measurement, separators=(',', ':'))
            producer.send(MEASUREMENTS_TOPIC, str.encode(measurement_json))
            measurement_counter += 1
        print(f"Sent {measurement_counter} measurements in {int(time.time() - starttime)}s ({measurement_failed_counter} failed)")
        time.sleep(max(0, LIVE_UPDATE_INTERVAL_S - ((time.time() - starttime))))
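
A quick, hedged way to verify the ingestion (not part of the commit) is to tail the measurements topic with the same kafka3 client the script uses:

import json
from kafka3 import KafkaConsumer  # kafka-python3, same client as in the script above

consumer = KafkaConsumer(
    "measurements",
    bootstrap_servers="kafka:9092",
    auto_offset_reset="earliest",
)
for message in consumer:
    print(json.loads(message.value))  # e.g. {"timestamp": ..., "value": ..., "station_uuid": "..."}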
Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
-- For Superset datasets
select
  measurements.__time as __time,
  measurements."value" as measurement,
  stations.agency as agency,
  stations.km as km,
  stations.latitude as latitude,
  stations.longitude as longitude,
  stations.longname as longname,
  stations.number as number,
  stations.shortname as shortname,
  stations.water_longname as water_longname,
  stations.water_shortname as water_shortname,
  measurements.station_uuid as station_uuid
from measurements inner join stations on stations.uuid = measurements.station_uuid

select * from
(
  select
    station_uuid,
    avg("value") as avg_measurement,
    latest("value") as current_measurement,
    (latest("value") - avg("value")) / avg("value") * 100 as deviation_percent,
    round((latest("value") - avg("value")) / avg("value") * 100 / 5) * 5 as deviation_percent_bucketed_5,
    round((latest("value") - avg("value")) / avg("value") * 100 / 10) * 10 as deviation_percent_bucketed_10,
    round((latest("value") - avg("value")) / avg("value") * 100 / 25) * 25 as deviation_percent_bucketed_25
  from measurements
  group by 1
) as measurements
inner join stations on stations.uuid = measurements.station_uuid
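
The second query feeds the deviation widgets: per station it compares the latest gauge reading with its average over the ingested measurements and buckets the deviation to the nearest 5, 10 and 25 percent. As a worked example with made-up numbers, a station averaging 200 cm whose latest reading is 224 cm has a deviation_percent of (224 - 200) / 200 * 100 = 12, which buckets to 10, 10 and 0 respectively.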
