|
| 1 | +## code to prepare `DATASET` dataset goes here |
| 2 | +####"inst/extdata/fisbroker.html = "view-source:https://fbinter.stadt-berlin.de/fb/gisbroker.do;jsessionid=C0EDCE1624C9A8877CB10BFE1B3C74EE?cmd=navigationFrameResult" |
| 3 | + |
| 4 | +get_jsessionid <- function(string) { |
| 5 | + jsessionid <- stringr::str_extract(string, "jsessionid=.*\\?.*") |
| 6 | + |
| 7 | + jsessionid[!is.na(jsessionid )] %>% |
| 8 | + stringr::str_remove("\\?.*") %>% |
| 9 | + stringr::str_remove(".*jsessionid=") |
| 10 | +} |
| 11 | + |
| 12 | +tabelle <- xml2::read_html("inst/extdata/fisbroker.html", encoding = "UTF-8") %>% |
| 13 | + rvest::html_element(css = "table.nav_tabelle") |
| 14 | + |
| 15 | +tabelle %>% |
| 16 | + rvest::html_elements(css = "tr.kategorie") |
| 17 | + |
| 18 | +x <- tabelle %>% |
| 19 | + rvest::html_elements(css = "tr") |
| 20 | + |
| 21 | + |
| 22 | + |
| 23 | +is_category <- stringr::str_detect(x, "class=\"kategorie\"") |
| 24 | + |
| 25 | +categories_df <- tibble::tibble( |
| 26 | + idx = which(is_category), |
| 27 | + category_id = seq_len(length(idx)), |
| 28 | + category_name = rvest::html_text(x[idx]) |
| 29 | +) |
| 30 | + |
| 31 | +is_dataset <- !is_category |
| 32 | + |
| 33 | +datasets_df <- tibble::tibble(idx = which(is_dataset), |
| 34 | + dataset_id = seq_len(length(idx)), |
| 35 | + dataset_name_raw = rvest::html_text(x[idx])) |
| 36 | + |
| 37 | + |
| 38 | +datasets_text_list <- stats::setNames(lapply(datasets_df$idx, function(idx) { |
| 39 | + elements <- x[idx] %>% |
| 40 | + rvest::html_elements(css = "a.standard") |
| 41 | + |
| 42 | + elements_text <- rvest::html_text(elements) |
| 43 | + elements_href <- rvest::html_attr(elements, "href") %>% |
| 44 | + stringr::str_replace(get_jsessionid(.), |
| 45 | + "<jsessionid>") |
| 46 | + |
| 47 | + n_elements <- length(elements) |
| 48 | + |
| 49 | + if(n_elements == 1) { |
| 50 | + tibble::tibble(dataset_name = elements_text[1], |
| 51 | + dataset_name_href = elements_href[1]) |
| 52 | + } else if (n_elements > 1) { |
| 53 | + dataset_name <- rvest::html_text(elements[1]) |
| 54 | + dataset_types <- rvest::html_text(elements[2:n_elements]) |
| 55 | + |
| 56 | + tibble::tibble(dataset_name = elements_text[1], |
| 57 | + dataset_name_href = elements_href[1], |
| 58 | + dataset_type = elements_text[2:n_elements], |
| 59 | + dataset_type_href = elements_href[2:n_elements]) |
| 60 | + } else { |
| 61 | + stop("no element found") |
| 62 | + } |
| 63 | + |
| 64 | + |
| 65 | +}),datasets_df$idx) |
| 66 | + |
| 67 | +fb_datasets <- dplyr::bind_rows(datasets_text_list, .id = "idx") %>% |
| 68 | + dplyr::mutate(idx = as.integer(.data$idx)) |
| 69 | + |
| 70 | +fb_datasets <- categories_df %>% |
| 71 | + dplyr::full_join(fb_datasets, by = "idx") %>% |
| 72 | + dplyr::arrange(.data$idx) %>% |
| 73 | + tidyr::fill(.data$category_id, |
| 74 | + .data$category_name) %>% |
| 75 | + dplyr::filter(!is.na(.data$dataset_name)) %>% |
| 76 | + dplyr::left_join(datasets_df[,c("idx", "dataset_id")], by = "idx") %>% |
| 77 | + dplyr::select(- .data$idx) %>% |
| 78 | + dplyr::relocate(.data$dataset_id, .before = .data$dataset_name) |
| 79 | + |
| 80 | +usethis::use_data(fb_datasets, overwrite = TRUE) |
0 commit comments