-
Notifications
You must be signed in to change notification settings - Fork 28
/
Copy pathpretty_dataframe_definitions.R
334 lines (303 loc) · 17.9 KB
/
pretty_dataframe_definitions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
create_console_dataframe_for_hardcoding <- function(dataframe,
round_significant_digits = 7,
rows_all_identical_width = T,
hard_code_factor_levels = F) {
#' Function to help hard code test fixtures more quickly when documenting function outputs. This
#' generates a command to define the provided dataframe and logs it in the console for quick copy-
#' paste transfer into a test expectation.
#'
#' @details Limitations or possible issues: no ordering of factor levels is included (you may have to
#' add that to the output post hoc if all.equal comparison says "Component "levels"" differs
#' in some capacity.)
#'
#' @param dataframe <data.frame> Dataframe to generate
#' @param round_significant_digits <integer> How many significant digits to preserve for numeric values
#' @param rows_all_identical_width <logical> Flag whether to use maximum character count of single longest
#' string representation as the field width throughout the whole table,
#' or whether to use the maximum count by row to save a little horizontal
#' space.
#' @param hard_code_factor_levels <logical> Flag whether to hard code the factor level names and order
#'
#' @return Nothing - dataframe generation string is logged in the console with `cat`
#'
#' @example We can create a sample dataframe with many classes of data (numeric, factor, character, logical)
#' and some missing entries (NA).
#' sample_dataframe = iris[c(1, 51, 79, 101),]
#' sample_dataframe$logic_flag = c(NA, T, F, T)
#' sample_dataframe$character = c("a","b","unknown","c")
#' sample_dataframe[2, 1] = NA
#' # Dataframe looks like:
#' # Sepal.Length Sepal.Width Petal.Length Petal.Width Species logic_flag character
#' # 1 5.1 3.5 1.4 0.2 setosa NA a
#' # 51 NA 3.2 4.7 1.4 versicolor TRUE b
#' # 79 6.0 2.9 4.5 1.5 versicolor FALSE unknown
#' # 101 6.3 3.3 6.0 2.5 virginica TRUE c
#' # Let's run the method on it:
#' create_console_dataframe_for_hardcoding(sample_dataframe,
#' rows_all_identical_width = T,
#' hard_code_factor_levels = T)
#' # Result in console:
#' data.frame(Sepal.Length = c( 5.1, NA, 6, 6.3),
#' Sepal.Width = c( 3.5, 3.2, 2.9, 3.3),
#' Petal.Length = c( 1.4, 4.7, 4.5, 6),
#' Petal.Width = c( 0.2, 1.4, 1.5, 2.5),
#' Species = factor(c( "setosa", "versicolor", "versicolor", "virginica"),
#' levels = c("setosa", "versicolor", "virginica")),
#' logic_flag = c( NA, TRUE, FALSE, TRUE),
#' character = c( "a", "b", "unknown", "c"),
#' stringsAsFactors = FALSE,
#' row.names = c("1", "51", "79", "101"))
#' # Note that the verticals (rows in finished dataframe) are sometimes wider than they need to be.
#' # To make them as narrow as possible while preserving alignment, set `rows_all_identical_width = F`.
#' # If factor levels do not need to be hardcoded, set `hard_code_factor_levels = F`.
#' # For comparison:
#' create_console_dataframe_for_hardcoding(sample_dataframe,
#' rows_all_identical_width = F,
#' hard_code_factor_levels = F)
#' # Result in console:
#' data.frame(Sepal.Length = c( 5.1, NA, 6, 6.3),
#' Sepal.Width = c( 3.5, 3.2, 2.9, 3.3),
#' Petal.Length = c( 1.4, 4.7, 4.5, 6),
#' Petal.Width = c( 0.2, 1.4, 1.5, 2.5),
#' Species = factor(c("setosa", "versicolor", "versicolor", "virginica")),
#' logic_flag = c( NA, TRUE, FALSE, TRUE),
#' character = c( "a", "b", "unknown", "c"),
#' stringsAsFactors = FALSE,
#' row.names = c("1", "51", "79", "101"))
# Column names and types
columns = colnames(dataframe)
column_classes = sapply(dataframe, class)
column_name_field_size = max(nchar(columns))
contains_factor = ("factor" %in% column_classes)
contains_character = ("character" %in% column_classes)
contains_integer = ("integer" %in% column_classes)
# Row names
rows = rownames(dataframe)
# Data fields get rounded for numeric, then find maximum size of the field for alignment
numeric_columns = columns[sapply(dataframe, is.numeric)]
dataframe[numeric_columns] = signif(dataframe[numeric_columns],
round_significant_digits)
# Get data field widths
data_field_size = get_data_field_size(dataframe,
columns,
column_classes,
rows_all_identical_width)
# Start building string representation
string_representation = c("data.frame(")
for (column_name in columns) {
col_definition = sprintf(paste0("%s%",
column_name_field_size - nchar(column_name),
"s = %s,\n"),
column_name,
"",
create_column_entry_string(dataframe[[column_name]],
column_classes[[column_name]],
data_field_size,
contains_factor,
contains_integer,
contains_character,
hard_code_factor_levels))
string_representation = paste0(string_representation,
col_definition)
}
string_representation = paste0(string_representation,
"stringsAsFactors = ",
!contains_character)
if (! all(rows == as.character(1:nrow(dataframe)))) {
string_representation = paste0(string_representation,
",\nrow.names = c(",
paste0('"',
rows,
'"',
collapse = ", "),
"))")
} else {
string_representation = paste0(string_representation,
")")
}
cat(string_representation)
}
get_data_field_size <- function(dataframe,
columns,
column_classes,
rows_all_identical_width) {
#' Calculation of how many characters are needed to accommodate the largest entry in the dataframe to
#' ensure alignment.
#'
#' @param dataframe <data.frame> Dataframe to generate
#' @param columns <character vector> Names of dataframe columns
#' @param column_classes <character vector> Classes of dataframe columns
#' @param rows_all_identical_width <logical> Flag whether to use maximum character count of single longest
#' string representation as the field width throughout the whole table,
#' or whether to use the maximum count by row to save a little horizontal
#' space.
#'
#' @return `data_field_size`, either a single integer to set the width of every table entry or a vector of
#' integers length matching the number of dataframe rows.
#'
#' @example We can create a sample dataframe with many classes of data (numeric, factor, character, logical)
#' and some missing entries (NA).
#' sample_dataframe = iris[c(1, 51, 79, 101),]
#' sample_dataframe$logic_flag = c(NA, T, F, T)
#' sample_dataframe$character = c("a","b","unknown","c")
#' sample_dataframe[2, 1] = NA
#' # Dataframe looks like:
#' # Sepal.Length Sepal.Width Petal.Length Petal.Width Species logic_flag character
#' # 1 5.1 3.5 1.4 0.2 setosa NA a
#' # 51 NA 3.2 4.7 1.4 versicolor TRUE b
#' # 79 6.0 2.9 4.5 1.5 versicolor FALSE unknown
#' # 101 6.3 3.3 6.0 2.5 virginica TRUE c
#' # Let's run the method on it:
#' get_data_field_size(sample_dataframe,
#' names(sample_dataframe),
#' c("numeric", "numeric", "numeric", "numeric", "factor", "logical", "character"),
#' T)
#' # Result: 12, which corresponds to the number of characters needed to fit the largest entry in the
#' # table (versicolor, which has 10 characters + 2 for quotation marks around for string
#' # format.)
#' # Let's run again with the row width flag toggled to false:
#' get_data_field_size(sample_dataframe,
#' names(sample_dataframe),
#' c("numeric", "numeric", "numeric", "numeric", "factor", "logical", "character"),
#' F)
#' # Result: c(8, 12, 12 11) by row name. This is the number of characters needed to fit the largest
#' entry in the table on a per-row basis. Note that rows a visualized vertically
#' during dataframe definition. These numbers come from the "Species" column +
#' quotation marks.
# Change NA values to a string representation to avoid frustration with character count
dataframe[is.na(dataframe)] = "NA"
character_counts = apply(dataframe, c(1,2), function(y) nchar(y))
# Add 2 extra characters for quotes around strings
column_needs_quotes = column_classes %in% c("factor", "character")
character_counts[,columns[column_needs_quotes]] = character_counts[,columns[column_needs_quotes]] + 2
# Determine whether to return a single value or a vector with a value for each row
if (rows_all_identical_width) {
data_field_size = max(character_counts, na.rm = T)
} else {
data_field_size = apply(character_counts, 1, function(x) max(x, na.rm = T))
}
return(data_field_size)
}
create_column_entry_string <- function(column_contents,
column_class,
field_size,
contains_factor,
contains_integer,
contains_character,
hard_code_factor_levels) {
#' Create a string to define a single column in the dataframe
#'
#' @param column_contents <vector> Values in the column
#' @param column_class <character> Class of the column
#' Can for sure handle numeric, integer, character, factor, logical
#' @param field_size <integer> Number of characters for each element, including whitespace padding.
#' Can be a single value to use everywhere, or a vector to control how much
#' space each element gets on a row-wise bases.
#' @param contains_factor <logical> Whether the dataframe as a whole contains any factor columns
#' @param contains_integer <logical> Whether the dataframe as a whole contains any integer columns
#' @param contains_character <logical> Whether the dataframe as a whole contains any character columns
#' @param hard_code_factor_levels <logical> Flag whether to hard code the factor level names and order
#'
#' @return Vector definition of the column as a string, e.g. "c(10, 3, 34)", with appropriate spacing.
#' See examples below.
#'
#' @example We can create a sample dataframe with many classes of data (numeric, factor, character, logical)
#' and some missing entries (NA).
#' sample_dataframe = iris[c(1, 51, 79, 101),]
#' sample_dataframe$logic_flag = c(NA, T, F, T)
#' sample_dataframe$character = c("a","b","unknown","c")
#' sample_dataframe[2, 1] = NA
#' # Dataframe looks like:
#' # Sepal.Length Sepal.Width Petal.Length Petal.Width Species logic_flag character
#' # 1 5.1 3.5 1.4 0.2 setosa NA a
#' # 51 NA 3.2 4.7 1.4 versicolor TRUE b
#' # 79 6.0 2.9 4.5 1.5 versicolor FALSE unknown
#' # 101 6.3 3.3 6.0 2.5 virginica TRUE c
#' # Let's get some columns:
#' create_column_entry_string(sample_dataframe$Sepal.Length,
#' "numeric",
#' field_size = 12,
#' contains_factor = T,
#' contains_integer = F,
#' contains_character = T,
#' hard_code_factor_levels = F)
#' # Results: " c( 5.1, NA, 6, 6.3)"
#' # Another column:
#' create_column_entry_string(sample_dataframe$Species,
#' "factor",
#' field_size = 12,
#' contains_factor = T,
#' contains_integer = F,
#' contains_character = T,
#' hard_code_factor_levels = F)
#' # Results: "factor(c( "setosa", "versicolor", "versicolor", "virginica"))"
#' # Another column:
#' create_column_entry_string(sample_dataframe$logic_flag,
#' "logical",
#' field_size = 12,
#' contains_factor = T,
#' contains_integer = F,
#' contains_character = T,
#' hard_code_factor_levels = F)
#' # Results: " c( NA, TRUE, FALSE, TRUE)"
# Determine formatting for each entry
if (length(field_size) == 1) {
field_size = rep(field_size,
length(column_contents))
}
# Single or multi-row?
contains_multiple_rows = (length(column_contents) > 1)
format_setting = paste0(ifelse(contains_multiple_rows,
"c(",
""),
paste0("%",
field_size,
"s",
collapse = ", "),
ifelse(contains_multiple_rows,
")",
""))
if (column_class %in% c("character", "factor")) {
if (column_class == "factor") {
level_description = paste0(",\nlevels = c(",
paste0('"',
levels(column_contents),
'"',
collapse = ", "),
")")
}
na_indexing = is.na(column_contents)
column_contents = paste0('"',
column_contents,
'"')
column_contents[na_indexing] = NA
}
text_definition = do.call(sprintf,
c(fmt = format_setting,
as.list(column_contents)))
left_whitespace_width = max(ifelse(contains_factor & (contains_character | hard_code_factor_levels), 7, 0),
ifelse(contains_integer, 11, 0)) -
ifelse(column_class == "factor" & (contains_character | hard_code_factor_levels), 7, 0) -
ifelse(column_class == "integer", 11, 0)
if (column_class == "factor" & (contains_character | hard_code_factor_levels)) {
# Need to add factor command
text_definition = paste0("factor(",
text_definition,
ifelse(hard_code_factor_levels,
level_description,
""),
")")
} else if (column_class == "integer") {
# Need to add as.integer coercion
text_definition = paste0("as.integer(",
text_definition,
")")
}
# Need to match bracket alignment of factor/integer column
text_definition = sprintf(paste0("%",
left_whitespace_width,
"s%s"),
"",
text_definition)
return(text_definition)
}