 """

 import difflib
-from logging import getLogger

 import requests
 from bs4 import BeautifulSoup
 from include.domain.container import Container
 from pendulum import datetime

 from airflow.decorators import dag, task
-from airflow.exceptions import AirflowSkipException
-
-logger = getLogger(__name__)
-
-
-def get_feed_by_page(page: int):
-    feed_url = f"https://data.geopf.fr/telechargement/resource/OCSGE?limit=50&page={page}"
-    feed = requests.get(feed_url)
-    return feed.text


 @dag(
@@ -33,71 +23,32 @@ def get_feed_by_page(page: int):
 )
 def diff_ocsge_download_page_to_mattermost():
     @task.python
-    def download_feed():
-        feed = get_feed_by_page(1)
-        soup = BeautifulSoup(feed, features="xml")
-
-        page_count = int(soup.select("feed")[0]["gpf_dl:pagecount"])
-        entry_count = soup.select("feed")[0]["gpf_dl:totalentries"]
-
-        logger.info(f"Found {entry_count} entries in {page_count} pages")
-
-        feed_as_string = ""
-
-        dict_entries = []
-
-        for page in range(1, page_count + 1):
-            logger.info(f"Downloading page {page}")
-            feed = get_feed_by_page(page)
-            soup = BeautifulSoup(feed, features="xml")
-            entries = soup.select("entry")
-            for entry in entries:
-                updated = entry.find("updated").text
-                link = entry.find("link")["href"]
-                departement = entry.find("gpf_dl:zone")["label"]
-                _format = entry.find("gpf_dl:format")["label"]
-                dict_entries.append(
-                    {
-                        "departement": departement,
-                        "link": link,
-                        "format": _format,
-                        "updated": updated,
-                    }
-                )
-
-        dict_entries.sort(key=lambda x: x.get("departement"))
-
-        for entry in dict_entries:
-            feed_as_string += entry.get("departement") + "\n"
-            feed_as_string += entry.get("link") + "\n"
-            feed_as_string += entry.get("format") + "\n"
-            feed_as_string += entry.get("updated") + "\n\n"
-
-        return feed_as_string
-
-    @task.python
-    def generate_diff(current_feed: str) -> str:
-        s3_path = "airflow-staging/simplified_ocsge_atom_feed.txt"
-        local_path = "simplified_ocsge_atom_feed.txt"
+    def diff():
+        url = "https://geoservices.ign.fr/artificialisation-ocs-ge#telechargement"
+        selector = "#block-ignpro-content > div > article > div.container > div:nth-child(2) > div"  # noqa: E501
+        s3_path = "airflow-staging/download_page_ocsge_artif.txt"
+        local_path = "download_page_ocsge.txt"

         if Container().s3().exists(s3_path):
             Container().s3().get_file(s3_path, local_path)
             with open(local_path, "r") as f:
-                previous_feed = f.read()
+                previous_txt = f.read()
         else:
-            previous_feed = ""
+            previous_txt = ""
+
+        new_html = requests.get(url).text
+        new_soup = BeautifulSoup(new_html, features="html.parser")
+        new_txt = new_soup.select(selector)[0].text.strip()

-        diff = difflib.unified_diff(a=previous_feed.splitlines(), b=current_feed.splitlines())
+        diff = difflib.unified_diff(previous_txt.splitlines(), new_txt.splitlines())

         with open(local_path, "w") as f:
-            f.write(current_feed)
+            f.write(new_txt)

         Container().s3().put_file(local_path, s3_path)

-        return "\n".join(diff)
+        diff_str = "\n".join(diff)

-    @task.python
-    def send_diff_to_mattermost(diff_str: str):
         if diff_str:
             markdown_message = "\n".join(
                 [
@@ -107,12 +58,8 @@ def send_diff_to_mattermost(diff_str: str):
                 ]
             )
             Container().notification().send(message=markdown_message)
-        else:
-            raise AirflowSkipException("No difference found")

-    current_feed = download_feed()
-    diff_str = generate_diff(current_feed)
-    send_diff_to_mattermost(diff_str)
+    diff()


 diff_ocsge_download_page_to_mattermost()
0 commit comments