Skip to content

flattening out the parse table #111

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Aug 7, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions R/nested.R
Original file line number Diff line number Diff line change
Expand Up @@ -101,18 +101,20 @@ add_terminal_token_before <- function(pd_flat) {
#'
#' @param spaces_after_prefix An integer vector with the number of spaces
#' after the prefix.
#' @param text_length Integer vector giving the number of characters of
#' the text.
#' @param force_one Whether spaces_after_prefix should be set to one in all
#' cases.
#' @return An integer vector of length spaces_after_prefix, which is either
#' one (if `force_one = TRUE`) or `spaces_after_prefix` with all values
#' below one set to one. Elements for which `text_length` is zero are set to
#' zero.
set_spaces <- function(spaces_after_prefix, force_one) {
set_spaces <- function(spaces_after_prefix, text_length, force_one) {
if (force_one) {
n_of_spaces <- rep(1, length(spaces_after_prefix))
} else {
n_of_spaces <- pmax(spaces_after_prefix, 1L)
}
n_of_spaces
ifelse(text_length > 0, n_of_spaces, 0)
}

#' Nest a flat parse table
Expand Down
2 changes: 1 addition & 1 deletion R/parsed.R
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ create_filler <- function(pd_flat) {
pd_flat$line3 <- lead(pd_flat$line1, default = tail(pd_flat$line2, 1))
pd_flat$col3 <- lead(pd_flat$col1, default = tail(pd_flat$col2, 1) + 1L)
pd_flat$newlines <- pd_flat$line3 - pd_flat$line2
pd_flat$lag_newlines <- lag(pd_flat$newlines, default = 0)
pd_flat$lag_newlines <- lag(pd_flat$newlines, default = 0L)
pd_flat$col2_nl <- if_else(pd_flat$newlines > 0L, 0L, pd_flat$col2)
pd_flat$spaces <- pd_flat$col3 - pd_flat$col2_nl - 1L
pd_flat$multi_line <- ifelse(pd_flat$terminal, FALSE, NA)
Expand Down
4 changes: 2 additions & 2 deletions R/rules-other.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@ add_brackets_in_pipe <- function(pd) {
lag_newlines = rep(0, 2),
terminal = rep(TRUE, 2),
spaces = rep(0, 2),
line1 = pd$line2[has_no_brackets] + 1:2,
line1 = pd$line1[has_no_brackets],
line2 = line1,
col1 = pd$col1[has_no_brackets],
col2 = col1,
col2 = col1 + 1:2,
indent = rep(0, 2),
child = rep(list(NULL), 2)
)
Expand Down
9 changes: 5 additions & 4 deletions R/rules-spacing.R
Original file line number Diff line number Diff line change
Expand Up @@ -136,9 +136,9 @@ set_space_between_levels <- function(pd_flat) {
#' Start comments with a space
#'
#' Forces comments to start with a space, that is, after the regular expression
#' "^#+'*", at least one space must follow. Multiple spaces may be legit for
#' indention in some situations.
#'
#' "^#+'*", at least one space must follow if the comment is *non-empty*, i.e.
#' there are not just spaces within the comment. Multiple spaces may be legit
#' for indention in some situations.
#' @param pd A parse table.
#' @param force_one Whether or not to force one space or allow multiple spaces
#' after the regex "^#+'*".
Expand All @@ -156,7 +156,8 @@ start_comments_with_space <- function(pd, force_one = FALSE) {
regex = "^(#+'*)( *)(.*)$")
comments$space_after_prefix <- nchar(comments$space_after_prefix)
comments$space_after_prefix <- set_spaces(
comments$space_after_prefix,
spaces_after_prefix = comments$space_after_prefix,
text_length = nchar(trimws(comments$text, "right")),
force_one
)

Expand Down
19 changes: 19 additions & 0 deletions R/serialize.R
Original file line number Diff line number Diff line change
Expand Up @@ -69,3 +69,22 @@ serialize_parse_data_flat <- function(pd_flat) {
.[[1L]]
}

#' Serialize flattened parse data
#'
#' Collapses a flattened parse table into its character vector representation,
#' one element per line of output.
#'
#' The `lag_newlines` of the first token is overwritten with `line1[1] - 1`.
#' The results of `add_newlines()` (on `lag_newlines`) and `add_spaces()` (on
#' `lag_spaces`) are then pasted in front of every token's `text`, and the
#' resulting single string is split at `"\n"`.
#' @param flattened_pd A flattened parse table.
#' @return A character vector obtained by splitting the collapsed text at
#'   newline characters.
serialize_parse_data_flattened <- function(flattened_pd) {
  flattened_pd$lag_newlines[1] <- flattened_pd$line1[1] - 1

  # Collapse all tokens, each preceded by its line breaks and spaces, into a
  # single string stored in the one-row column `text_ws`.
  collapsed <- summarize_(
    flattened_pd,
    text_ws = ~paste0(
      map(lag_newlines, add_newlines),
      map(lag_spaces, add_spaces),
      text,
      collapse = ""
    )
  )

  strsplit(collapsed[["text_ws"]], "\n", fixed = TRUE)[[1L]]
}
7 changes: 2 additions & 5 deletions R/serialized_tests.R
Original file line number Diff line number Diff line change
Expand Up @@ -190,11 +190,8 @@ style_indent_curly_round <- function(text) {
NULL
)

text %>%
compute_parse_data_nested() %>%
apply_transformers(transformers) %>%
serialize_parse_data_nested()

transformed_text <- parse_transform_serialize(text, transformers)
transformed_text
}

#' @describeIn test_transformer Transformations for indention based on operators
Expand Down
15 changes: 12 additions & 3 deletions R/transform.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ transform_files <- function(files, transformers, flat) {
}
invisible(changed)
}

#' Closure to return a transformer function
#'
#' This function takes a list of transformer functions as input and
Expand Down Expand Up @@ -86,7 +85,10 @@ parse_transform_serialize <- function(text, transformers) {
pd_nested <- compute_parse_data_nested(text)
transformed_pd <- apply_transformers(pd_nested, transformers)
# TODO verify_roundtrip
serialized_transformed_text <- serialize_parse_data_nested(transformed_pd)
flattened_pd <- post_visit(transformed_pd, list(extract_terminals)) %>%
enrich_terminals()

serialized_transformed_text <- serialize_parse_data_flattened(flattened_pd)
serialized_transformed_text
}

Expand All @@ -113,5 +115,12 @@ apply_transformers <- function(pd_nested, transformers) {

transformed_all <- pre_visit(transformed_updated_multi_line,
c(transformers$space, transformers$token))
transformed_all

transformed_absolute_indent <- context_to_terminals(transformed_all,
outer_lag_newlines = 0,
outer_indent = 0,
outer_spaces = 0)

transformed_absolute_indent

}
124 changes: 124 additions & 0 deletions R/visit.R
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,127 @@ visit_one <- function(pd_flat, funs) {
reduce(funs, function(x, fun) fun(x),
.init = pd_flat)
}


#' Propagate context to terminals
#'
#' Implements a very specific pre-visiting scheme, namely to propagate
#' indention, spaces and lag_newlines through the inner tokens down to the
#' terminals. This means that information regarding indention, line breaks and
#' spaces (which is relative in `pd_nested`) will be converted into absolute
#' information.
#' @inherit context_towards_terminals
#' @seealso context_towards_terminals visitors
context_to_terminals <- function(pd_nested,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do we need to change to use this with pre_visit() instead of a "manual" recursive call?

Copy link
Collaborator Author

@lorenzwalthert lorenzwalthert Aug 4, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried that but I could not find an easy way to do that. In contrast to pre_visit(), we pass scalars from one level to the other, not functions. Also, unlike in pre_visit(), we do not map over the children only, but simultaneously over other columns too (pmap() instead of map()). You could probably create a function that can accommodate both (a pmap visitor and for the usual case, we just have p = 1), but I felt it would not make things clearer since the tasks are not very similar in their implementation.

Copy link
Member

@krlmlr krlmlr Aug 5, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about:

visit_context_to_terminals <- function(pd) {
  pd <- context_towards_terminals(pd, pd$outer_lag_newlines, ...)
  pd$child <- map2(pd$child, ..., function(x, y) { 
    x[["outer_lag_newlines"]] <- y
  })
  ...
  pd
}

This stores the information for the next stage of the visitor in the children, where it is then picked up as needed.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did this:

context_to_terminals <- function(pd_nested) {

  if (is.null(pd_nested)) return()

  pd_transformed <- context_towards_terminals(
    pd_nested,
    pd_nested$outer_lag_newlines,
    pd_nested$outer_indent,
    pd_nested$outer_spaces
  )

  pd_transformed$child <- pmap(
    list(
      pd_transformed$child,
      pd_transformed$lag_newlines,
      pd_transformed$indent,
      pd_transformed$spaces),
    function(child, lag_newlines, indent, spaces) {
      if (is.null(child)) return(NULL)
      child[1, "outer_lag_newlines"] <- lag_newlines
      child[["outer_indent"]] <- indent
      child[nrow(child), "outer_spaces"] <- spaces
      child
    })
  pd_transformed
}

And changed context_towards_terminals to

context_towards_terminals <- function(pd_nested,
                                      outer_lag_newlines,
                                      outer_indent,
                                      outer_spaces) {
  pd_nested$indent <- pd_nested$indent + outer_indent
  pd_nested$lag_newlines <- pd_nested$lag_newlines + outer_lag_newlines
  pd_nested$spaces <-
    pd_nested$spaces + outer_spaces
  pd_nested
}

Which could probably be done more elegantly with a map_if().
In addition, we need to initialise the new columns in create_filler

pd_flat$outer_lag_newlines <- 0
  pd_flat$outer_indent <- 0
  pd_flat$outer_spaces <- 0

And in all functions that create new tokens such as add_brackets_in_pipe.
All tests pass.
What do you think? I think initial version was more clear, did not need three new columns and had less (almost-)code duplication. Maybe there is some more simplification I missed?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would only add these columns temporarily in this visitor, and then remove them from the final result.

Anyway, I thought we should keep using visitors instead of recursive calls, but if the code is more difficult to understand with the visitor, let's keep the recursive calls.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another advantage of using visitors though is that we can allow the user to take control over those transformations if the transformers are passed via the transformers argument. Maybe we can change it later if we consider this to be a manipulation the user should control.

outer_lag_newlines,
outer_indent,
outer_spaces) {

if (is.null(pd_nested)) return()

pd_transformed <- context_towards_terminals(
pd_nested, outer_lag_newlines, outer_indent, outer_spaces
)

pd_transformed$child <- pmap(list(pd_transformed$child,
pd_transformed$lag_newlines,
pd_transformed$indent,
pd_transformed$spaces),
context_to_terminals)
pd_transformed
}


#' Update a parse table given outer context
#'
#' Shifts the positional information of `pd_nested` by the context handed in
#' from the enclosing level: `outer_indent` is added to the `indent` of every
#' token, `outer_lag_newlines` is added to the `lag_newlines` of the first
#' token only, and `outer_spaces` is added to the `spaces` of the last token
#' only. [context_to_terminals()] calls this function on every level of
#' nesting, which is how the parse information ends up at the terminal tokens.
#' @param pd_nested A nested parse table.
#' @param outer_lag_newlines The lag_newlines to be propagated inwards.
#' @param outer_indent The indention depth to be propagated inwards.
#' @param outer_spaces The number of spaces to be propagated inwards.
#' @return An updated parse table.
#' @seealso context_to_terminals
context_towards_terminals <- function(pd_nested,
                                      outer_lag_newlines,
                                      outer_indent,
                                      outer_spaces) {
  last_row <- nrow(pd_nested)
  first_lag_newlines <- pd_nested$lag_newlines[1]
  last_spaces <- pd_nested$spaces[last_row]

  # Indention applies to all tokens, line breaks only to the first token and
  # trailing spaces only to the last one.
  pd_nested$indent <- pd_nested$indent + outer_indent
  pd_nested$lag_newlines[1] <- first_lag_newlines + outer_lag_newlines
  pd_nested$spaces[last_row] <- last_spaces + outer_spaces
  pd_nested
}

#' Extract terminal tokens
#'
#' Turns a nested parse table into a flat parse table and extracts *all*
#' attributes.
#' @param pd_nested A nested parse table, or `NULL`.
#' @return `NULL` if `pd_nested` is `NULL`. Otherwise the result of row-binding,
#'   for every row of `pd_nested`, either the row itself (if it is a terminal)
#'   or the content of its `child` column (if it is not).
extract_terminals <- function(pd_nested) {
  # Nothing to flatten: hand back NULL explicitly instead of referring to an
  # object that does not exist in this scope.
  if (is.null(pd_nested)) return(NULL)
  # One single-row table per token, so terminals can be kept as they are.
  pd_split <- split(pd_nested, seq_len(nrow(pd_nested)))
  bind_rows(ifelse(pd_nested$terminal, pd_split, pd_nested$child))
}


#' Enrich flattened parse table
#'
#' Enriches a flattened parse table that contains terminals only. In
#' particular, it computes the exact position (line and column) a token will
#' have when it is serialized. The columns `lag_spaces`, `newlines`, `nchar`,
#' `col1` and `col2` are (re-)computed and `line1` is updated.
#' @details Since we have only terminal tokens now, the line on which a token
#'   starts will also be the line on which it ends. We call `line1` the line on
#'   which the token starts. `line1` has the same meaning as `line1` that can
#'   be found in a flat parse table (see [tokenize()]), just that the `line1`
#'   created by `enrich_terminals()` is the updated version of the former
#'   `line1`. The same applies for `col1` and `col2`.
#' @inheritParams choose_indention
enrich_terminals <- function(flattened_pd, use_raw_indention = FALSE) {
  # Spaces *before* a token are the spaces after the preceding one.
  flattened_pd$lag_spaces <- lag(flattened_pd$spaces, default = 0L)
  flattened_pd <- choose_indention(flattened_pd, use_raw_indention)

  # Line numbers are counted up from the line of the very first token.
  first_line <- flattened_pd$line1[1]
  flattened_pd$line1 <- cumsum(flattened_pd$lag_newlines) + first_line
  flattened_pd$newlines <- lead(flattened_pd$lag_newlines, default = 0L)
  flattened_pd$nchar <- nchar(flattened_pd$text, type = "width")

  # Within a line, a token ends where the accumulated widths and preceding
  # spaces of all tokens up to and including itself end.
  by_line <- group_by(flattened_pd, line1)
  with_col2 <- mutate(by_line, col2 = cumsum(nchar + lag_spaces))
  flattened_pd <- ungroup(with_col2)

  flattened_pd$col1 <- flattened_pd$col2 - flattened_pd$nchar
  flattened_pd
}

#' Choose the indention method for the tokens
#'
#' Either use the raw indention, which is just the spaces computed between
#' the first token on a new line and the token before it, or use the indention
#' computed according to the transformer used, which is stored in the column
#' `indent`.
#'
#' All indention information is combined with the space information of the
#' first token on a new line: if `use_raw_indention` is `FALSE`, every token
#' with `lag_newlines > 0` gets its `lag_spaces` replaced by its `indent`,
#' i.e. it "inherits" the indention of the whole line. If `use_raw_indention`
#' is `TRUE`, `lag_spaces` is left untouched and the information in `indent`
#' is discarded.
#' In both cases the column `indent` is removed, since all necessary
#' information is then contained in the spacing information of the first
#' token on a new line.
#' @param flattened_pd A nested parse table that was turned into a flat parse
#'   table using [extract_terminals()].
#' @param use_raw_indention Boolean indicating whether or not the raw
#'   indention should be used.
#' @return `flattened_pd` without the column `indent` and with `lag_spaces`
#'   updated as described above.
choose_indention <- function(flattened_pd, use_raw_indention) {
  if (!use_raw_indention) {
    starts_new_line <- flattened_pd$lag_newlines > 0
    flattened_pd$lag_spaces <- ifelse(
      starts_new_line,
      flattened_pd$indent,
      flattened_pd$lag_spaces
    )
  }
  flattened_pd$indent <- NULL
  flattened_pd
}


31 changes: 31 additions & 0 deletions man/choose_indention.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

29 changes: 29 additions & 0 deletions man/context_to_terminals.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

31 changes: 31 additions & 0 deletions man/context_towards_terminals.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

28 changes: 28 additions & 0 deletions man/enrich_terminals.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading