Skip to content

Commit 041f844

Browse files
authored
Merge pull request #1160 from MTES-MCT/feat-diff-ocsge
fix(diff): readd a simple diff tool for the download page
2 parents 4cec7a4 + 2ddaeb9 commit 041f844

File tree

1 file changed

+15
-68
lines changed

1 file changed

+15
-68
lines changed

airflow/dags/diff_ocsge_download_page_to_mattermost.py

Lines changed: 15 additions & 68 deletions
Original file line number | Diff line number | Diff line change
@@ -4,23 +4,13 @@
44
"""
55

66
import difflib
7-
from logging import getLogger
87

98
import requests
109
from bs4 import BeautifulSoup
1110
from include.domain.container import Container
1211
from pendulum import datetime
1312

1413
from airflow.decorators import dag, task
15-
from airflow.exceptions import AirflowSkipException
16-
17-
logger = getLogger(__name__)
18-
19-
20-
def get_feed_by_page(page: int):
21-
feed_url = f"https://data.geopf.fr/telechargement/resource/OCSGE?limit=50&page={page}"
22-
feed = requests.get(feed_url)
23-
return feed.text
2414

2515

2616
@dag(
@@ -33,71 +23,32 @@ def get_feed_by_page(page: int):
3323
)
3424
def diff_ocsge_download_page_to_mattermost():
3525
@task.python
36-
def download_feed():
37-
feed = get_feed_by_page(1)
38-
soup = BeautifulSoup(feed, features="xml")
39-
40-
page_count = int(soup.select("feed")[0]["gpf_dl:pagecount"])
41-
entry_count = soup.select("feed")[0]["gpf_dl:totalentries"]
42-
43-
logger.info(f"Found {entry_count} entries in {page_count} pages")
44-
45-
feed_as_string = ""
46-
47-
dict_entries = []
48-
49-
for page in range(1, page_count + 1):
50-
logger.info(f"Downloading page {page}")
51-
feed = get_feed_by_page(page)
52-
soup = BeautifulSoup(feed, features="xml")
53-
entries = soup.select("entry")
54-
for entry in entries:
55-
updated = entry.find("updated").text
56-
link = entry.find("link")["href"]
57-
departement = entry.find("gpf_dl:zone")["label"]
58-
_format = entry.find("gpf_dl:format")["label"]
59-
dict_entries.append(
60-
{
61-
"departement": departement,
62-
"link": link,
63-
"format": _format,
64-
"updated": updated,
65-
}
66-
)
67-
68-
dict_entries.sort(key=lambda x: x.get("departement"))
69-
70-
for entry in dict_entries:
71-
feed_as_string += entry.get("departement") + "\n"
72-
feed_as_string += entry.get("link") + "\n"
73-
feed_as_string += entry.get("format") + "\n"
74-
feed_as_string += entry.get("updated") + "\n\n"
75-
76-
return feed_as_string
77-
78-
@task.python
79-
def generate_diff(current_feed: str) -> str:
80-
s3_path = "airflow-staging/simplified_ocsge_atom_feed.txt"
81-
local_path = "simplified_ocsge_atom_feed.txt"
26+
def diff():
27+
url = "https://geoservices.ign.fr/artificialisation-ocs-ge#telechargement"
28+
selector = "#block-ignpro-content > div > article > div.container > div:nth-child(2) > div" # noqa: E501
29+
s3_path = "airflow-staging/download_page_ocsge_artif.txt"
30+
local_path = "download_page_ocsge.txt"
8231

8332
if Container().s3().exists(s3_path):
8433
Container().s3().get_file(s3_path, local_path)
8534
with open(local_path, "r") as f:
86-
previous_feed = f.read()
35+
previous_txt = f.read()
8736
else:
88-
previous_feed = ""
37+
previous_txt = ""
38+
39+
new_html = requests.get(url).text
40+
new_soup = BeautifulSoup(new_html, features="html.parser")
41+
new_txt = new_soup.select(selector)[0].text.strip()
8942

90-
diff = difflib.unified_diff(a=previous_feed.splitlines(), b=current_feed.splitlines())
43+
diff = difflib.unified_diff(previous_txt.splitlines(), new_txt.splitlines())
9144

9245
with open(local_path, "w") as f:
93-
f.write(current_feed)
46+
f.write(new_txt)
9447

9548
Container().s3().put_file(local_path, s3_path)
9649

97-
return "\n".join(diff)
50+
diff_str = "\n".join(diff)
9851

99-
@task.python
100-
def send_diff_to_mattermost(diff_str: str):
10152
if diff_str:
10253
markdown_message = "\n".join(
10354
[
@@ -107,12 +58,8 @@ def send_diff_to_mattermost(diff_str: str):
10758
]
10859
)
10960
Container().notification().send(message=markdown_message)
110-
else:
111-
raise AirflowSkipException("No difference found")
11261

113-
current_feed = download_feed()
114-
diff_str = generate_diff(current_feed)
115-
send_diff_to_mattermost(diff_str)
62+
diff()
11663

11764

11865
diff_ocsge_download_page_to_mattermost()

0 commit comments

Comments (0)