
Commit b0cf844

Michael Chirico committed:
Merge branch 'master' into r-stop-paste
2 parents e4b8ca9 + 410fa91

40 files changed (+799 / -580 lines)

R/pkg/R/DataFrame.R

Lines changed: 72 additions & 50 deletions
@@ -271,7 +271,7 @@ setMethod("show", "SparkDataFrame",
 paste(l, collapse = ":")
 })
 s <- paste(cols, collapse = ", ")
-cat(paste(class(object), "[", s, "]\n", sep = ""))
+cat(paste0(class(object), "[", s, "]\n"))
 }
 })
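This commit's recurring change is swapping paste(..., sep = "") for the equivalent paste0(...). A minimal standalone R check of the equivalence (the string s is a toy stand-in, not the real column spec):

    # paste0() is paste() with sep = "" built in, so the two calls below
    # build the same string
    s <- "age:int, name:string"
    identical(paste("SparkDataFrame", "[", s, "]\n", sep = ""),
              paste0("SparkDataFrame", "[", s, "]\n"))  # TRUE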

@@ -1659,9 +1659,7 @@ setMethod("dapplyCollect",
 #'
 #' @param cols grouping columns.
 #' @param func a function to be applied to each group partition specified by grouping
-#' column of the SparkDataFrame. The function \code{func} takes as argument
-#' a key - grouping columns and a data frame - a local R data.frame.
-#' The output of \code{func} is a local R data.frame.
+#' column of the SparkDataFrame. See Details.
 #' @param schema the schema of the resulting SparkDataFrame after the function is applied.
 #' The schema must match to output of \code{func}. It has to be defined for each
 #' output column with preferred output column name and corresponding data type.
@@ -1671,29 +1669,43 @@ setMethod("dapplyCollect",
 #' @aliases gapply,SparkDataFrame-method
 #' @rdname gapply
 #' @name gapply
+#' @details
+#' \code{func} is a function of two arguments. The first, usually named \code{key}
+#' (though this is not enforced) corresponds to the grouping key, will be an
+#' unnamed \code{list} of \code{length(cols)} length-one objects corresponding
+#' to the grouping columns' values for the current group.
+#'
+#' The second, herein \code{x}, will be a local \code{\link{data.frame}} with the
+#' columns of the input not in \code{cols} for the rows corresponding to \code{key}.
+#'
+#' The output of \code{func} must be a \code{data.frame} matching \code{schema} --
+#' in particular this means the names of the output \code{data.frame} are irrelevant
+#'
 #' @seealso \link{gapplyCollect}
 #' @examples
 #'
 #' \dontrun{
-#' Computes the arithmetic mean of the second column by grouping
-#' on the first and third columns. Output the grouping values and the average.
+#' # Computes the arithmetic mean of the second column by grouping
+#' # on the first and third columns. Output the grouping values and the average.
 #'
 #' df <- createDataFrame (
 #' list(list(1L, 1, "1", 0.1), list(1L, 2, "1", 0.2), list(3L, 3, "3", 0.3)),
 #' c("a", "b", "c", "d"))
 #'
-#' Here our output contains three columns, the key which is a combination of two
-#' columns with data types integer and string and the mean which is a double.
+#' # Here our output contains three columns, the key which is a combination of two
+#' # columns with data types integer and string and the mean which is a double.
 #' schema <- structType(structField("a", "integer"), structField("c", "string"),
 #' structField("avg", "double"))
 #' result <- gapply(
 #' df,
 #' c("a", "c"),
 #' function(key, x) {
+#' # key will either be list(1L, '1') (for the group where a=1L,c='1') or
+#' # list(3L, '3') (for the group where a=3L,c='3')
 #' y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE)
 #' }, schema)
 #'
-#' The schema also can be specified in a DDL-formatted string.
+#' # The schema also can be specified in a DDL-formatted string.
 #' schema <- "a INT, c STRING, avg DOUBLE"
 #' result <- gapply(
 #' df,
@@ -1702,8 +1714,8 @@ setMethod("dapplyCollect",
 #' y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE)
 #' }, schema)
 #'
-#' We can also group the data and afterwards call gapply on GroupedData.
-#' For Example:
+#' # We can also group the data and afterwards call gapply on GroupedData.
+#' # For example:
 #' gdf <- group_by(df, "a", "c")
 #' result <- gapply(
 #' gdf,
@@ -1712,15 +1724,15 @@ setMethod("dapplyCollect",
 #' }, schema)
 #' collect(result)
 #'
-#' Result
-#' ------
-#' a c avg
-#' 3 3 3.0
-#' 1 1 1.5
+#' # Result
+#' # ------
+#' # a c avg
+#' # 3 3 3.0
+#' # 1 1 1.5
 #'
-#' Fits linear models on iris dataset by grouping on the 'Species' column and
-#' using 'Sepal_Length' as a target variable, 'Sepal_Width', 'Petal_Length'
-#' and 'Petal_Width' as training features.
+#' # Fits linear models on iris dataset by grouping on the 'Species' column and
+#' # using 'Sepal_Length' as a target variable, 'Sepal_Width', 'Petal_Length'
+#' # and 'Petal_Width' as training features.
 #'
 #' df <- createDataFrame (iris)
 #' schema <- structType(structField("(Intercept)", "double"),
@@ -1736,12 +1748,12 @@ setMethod("dapplyCollect",
 #' }, schema)
 #' collect(df1)
 #'
-#' Result
-#' ---------
-#' Model (Intercept) Sepal_Width Petal_Length Petal_Width
-#' 1 0.699883 0.3303370 0.9455356 -0.1697527
-#' 2 1.895540 0.3868576 0.9083370 -0.6792238
-#' 3 2.351890 0.6548350 0.2375602 0.2521257
+#' # Result
+#' # ---------
+#' # Model (Intercept) Sepal_Width Petal_Length Petal_Width
+#' # 1 0.699883 0.3303370 0.9455356 -0.1697527
+#' # 2 1.895540 0.3868576 0.9083370 -0.6792238
+#' # 3 2.351890 0.6548350 0.2375602 0.2521257
 #'
 #'}
 #' @note gapply(SparkDataFrame) since 2.0.0
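To make the new @details contract concrete, here is a hedged sketch in plain R (no Spark session; key and x are hand-built to mirror what Spark would pass for the group a = 1L, c = "1" in the example data above):

    func <- function(key, x) {
      # key: unnamed list of the grouping values, e.g. list(1L, "1")
      # x:   local data.frame of the remaining columns for this group
      # the output's column names are irrelevant; only the positions and
      # types have to line up with the declared schema
      data.frame(key, mean(x$b), stringsAsFactors = FALSE)
    }
    func(list(1L, "1"), data.frame(b = c(1, 2), d = c(0.1, 0.2)))
    # one row -- 1, "1", 1.5 -- matching "a INT, c STRING, avg DOUBLE"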
@@ -1759,20 +1771,30 @@ setMethod("gapply",
 #'
 #' @param cols grouping columns.
 #' @param func a function to be applied to each group partition specified by grouping
-#' column of the SparkDataFrame. The function \code{func} takes as argument
-#' a key - grouping columns and a data frame - a local R data.frame.
-#' The output of \code{func} is a local R data.frame.
+#' column of the SparkDataFrame. See Details.
 #' @return A data.frame.
 #' @family SparkDataFrame functions
 #' @aliases gapplyCollect,SparkDataFrame-method
 #' @rdname gapplyCollect
 #' @name gapplyCollect
+#' @details
+#' \code{func} is a function of two arguments. The first, usually named \code{key}
+#' (though this is not enforced) corresponds to the grouping key, will be an
+#' unnamed \code{list} of \code{length(cols)} length-one objects corresponding
+#' to the grouping columns' values for the current group.
+#'
+#' The second, herein \code{x}, will be a local \code{\link{data.frame}} with the
+#' columns of the input not in \code{cols} for the rows corresponding to \code{key}.
+#'
+#' The output of \code{func} must be a \code{data.frame} matching \code{schema} --
+#' in particular this means the names of the output \code{data.frame} are irrelevant
+#'
 #' @seealso \link{gapply}
 #' @examples
 #'
 #' \dontrun{
-#' Computes the arithmetic mean of the second column by grouping
-#' on the first and third columns. Output the grouping values and the average.
+#' # Computes the arithmetic mean of the second column by grouping
+#' # on the first and third columns. Output the grouping values and the average.
 #'
 #' df <- createDataFrame (
 #' list(list(1L, 1, "1", 0.1), list(1L, 2, "1", 0.2), list(3L, 3, "3", 0.3)),
@@ -1787,8 +1809,8 @@ setMethod("gapply",
 #' y
 #' })
 #'
-#' We can also group the data and afterwards call gapply on GroupedData.
-#' For Example:
+#' # We can also group the data and afterwards call gapply on GroupedData.
+#' # For example:
 #' gdf <- group_by(df, "a", "c")
 #' result <- gapplyCollect(
 #' gdf,
@@ -1798,15 +1820,15 @@ setMethod("gapply",
 #' y
 #' })
 #'
-#' Result
-#' ------
-#' key_a key_c mean_b
-#' 3 3 3.0
-#' 1 1 1.5
+#' # Result
+#' # ------
+#' # key_a key_c mean_b
+#' # 3 3 3.0
+#' # 1 1 1.5
 #'
-#' Fits linear models on iris dataset by grouping on the 'Species' column and
-#' using 'Sepal_Length' as a target variable, 'Sepal_Width', 'Petal_Length'
-#' and 'Petal_Width' as training features.
+#' # Fits linear models on iris dataset by grouping on the 'Species' column and
+#' # using 'Sepal_Length' as a target variable, 'Sepal_Width', 'Petal_Length'
+#' # and 'Petal_Width' as training features.
 #'
 #' df <- createDataFrame (iris)
 #' result <- gapplyCollect(
@@ -1818,12 +1840,12 @@ setMethod("gapply",
 #' data.frame(t(coef(m)))
 #' })
 #'
-#' Result
-#'---------
-#' Model X.Intercept. Sepal_Width Petal_Length Petal_Width
-#' 1 0.699883 0.3303370 0.9455356 -0.1697527
-#' 2 1.895540 0.3868576 0.9083370 -0.6792238
-#' 3 2.351890 0.6548350 0.2375602 0.2521257
+#' # Result
+#' # ---------
+#' # Model X.Intercept. Sepal_Width Petal_Length Petal_Width
+#' # 1 0.699883 0.3303370 0.9455356 -0.1697527
+#' # 2 1.895540 0.3868576 0.9083370 -0.6792238
+#' # 3 2.351890 0.6548350 0.2375602 0.2521257
 #'
 #'}
 #' @note gapplyCollect(SparkDataFrame) since 2.0.0
@@ -2735,10 +2757,10 @@ setMethod("merge",
 colY <- joinY[[i]]
 
 if (colX %in% by) {
-colX <- paste(colX, suffixes[1], sep = "")
+colX <- paste0(colX, suffixes[1])
 }
 if (colY %in% by) {
-colY <- paste(colY, suffixes[2], sep = "")
+colY <- paste0(colY, suffixes[2])
 }
 
 colX <- getColumn(xsel, colX)
@@ -2753,7 +2775,7 @@
 
 # sorts the result by 'by' columns if sort = TRUE
 if (sort && length(by) > 0) {
-colNameWithSuffix <- paste(by, suffixes[2], sep = "")
+colNameWithSuffix <- paste0(by, suffixes[2])
 joinRes <- do.call("arrange", c(joinRes, colNameWithSuffix, decreasing = FALSE))
 }
 
@@ -2776,7 +2798,7 @@ genAliasesForIntersectedCols <- function(x, intersectedColNames, suffix) {
 cols <- lapply(allColNames, function(colName) {
 col <- getColumn(x, colName)
 if (colName %in% intersectedColNames) {
-newJoin <- paste(colName, suffix, sep = "")
+newJoin <- paste0(colName, suffix)
 if (newJoin %in% allColNames) {
 stop("The following column name: ", newJoin, " occurs more than once in the 'DataFrame'.",
 "Please use different suffixes for the intersected columns.")

R/pkg/R/RDD.R

Lines changed: 1 addition & 1 deletion
@@ -69,7 +69,7 @@ setMethod("initialize", "RDD", function(.Object, jrdd, serializedMode,
 
 setMethod("showRDD", "RDD",
 function(object) {
-cat(paste(callJMethod(getJRDD(object), "toString"), "\n", sep = ""))
+cat(paste0(callJMethod(getJRDD(object), "toString"), "\n"))
 })
 
 setMethod("initialize", "PipelinedRDD", function(.Object, prev, func, jrdd_val) {

R/pkg/R/functions.R

Lines changed: 0 additions & 1 deletion
@@ -3951,7 +3951,6 @@ setMethod("map_values",
 #' @rdname column_collection_functions
 #' @aliases map_zip_with map_zip_with,characterOrColumn,characterOrColumn,function-method
 #'
-#' @examples
 #' @note map_zip_with since 3.1.0
 setMethod("map_zip_with",
 signature(x = "characterOrColumn", y = "characterOrColumn", f = "function"),

R/pkg/R/schema.R

Lines changed: 3 additions & 4 deletions
@@ -99,10 +99,9 @@ print.structType <- function(x, ...) {
 cat("StructType\n",
 sapply(x$fields(),
 function(field) {
-paste("|-", "name = \"", field$name(),
-"\", type = \"", field$dataType.toString(),
-"\", nullable = ", field$nullable(), "\n",
-sep = "")
+paste0("|-", "name = \"", field$name(),
+"\", type = \"", field$dataType.toString(),
+"\", nullable = ", field$nullable(), "\n")
 }),
 sep = "")
 }

R/pkg/R/sparkR.R

Lines changed: 1 addition & 1 deletion
@@ -244,7 +244,7 @@ sparkR.sparkContext <- function(
 uriSep <- "////"
 }
 localJarPaths <- lapply(jars,
-function(j) { utils::URLencode(paste("file:", uriSep, j, sep = "")) })
+function(j) { utils::URLencode(paste0("file:", uriSep, j)) })
 
 # Set the start time to identify jobjs
 # Seconds resolution is good enough for this purpose, so use ints
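A standalone sketch of what the refactored line builds, assuming the branch above selected uriSep <- "////" (the jar path is hypothetical):

    j <- "C:/spark/my jar.jar"   # hypothetical path containing a space
    uriSep <- "////"
    utils::URLencode(paste0("file:", uriSep, j))
    # "file:////C:/spark/my%20jar.jar"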

R/pkg/R/types.R

Lines changed: 6 additions & 11 deletions
@@ -94,27 +94,22 @@ checkSchemaInArrow <- function(schema) {
 }
 
 # Both cases below produce a corrupt value for unknown reason. It needs to be investigated.
-if (any(sapply(schema$fields(), function(x) x$dataType.toString() == "FloatType"))) {
+field_strings <- sapply(schema$fields(), function(x) x$dataType.toString())
+if (any(field_strings == "FloatType")) {
 stop("Arrow optimization in R does not support float type yet.")
 }
-if (any(sapply(schema$fields(), function(x) x$dataType.toString() == "BinaryType"))) {
+if (any(field_strings == "BinaryType")) {
 stop("Arrow optimization in R does not support binary type yet.")
 }
-if (any(sapply(schema$fields(),
-function(x) startsWith(x$dataType.toString(),
-"ArrayType")))) {
+if (any(startsWith(field_strings, "ArrayType"))) {
 stop("Arrow optimization in R does not support array type yet.")
 }
 
 # Arrow optimization in Spark does not yet support both cases below.
-if (any(sapply(schema$fields(),
-function(x) startsWith(x$dataType.toString(),
-"StructType")))) {
+if (any(startsWith(field_strings, "StructType"))) {
 stop("Arrow optimization in R does not support nested struct type yet.")
 }
-if (any(sapply(schema$fields(),
-function(x) startsWith(x$dataType.toString(),
-"MapType")))) {
+if (any(startsWith(field_strings, "MapType"))) {
 stop("Arrow optimization in R does not support map type yet.")
 }
 }
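The refactor computes the dataType strings once into field_strings and leans on startsWith() being vectorized over its first argument, so one precomputed character vector replaces four separate sapply() traversals. A toy illustration with hand-written type strings standing in for real schema fields:

    field_strings <- c("IntegerType", "ArrayType(StringType,true)", "StringType")
    any(field_strings == "FloatType")            # FALSE -- no float column
    any(startsWith(field_strings, "ArrayType"))  # TRUE  -- would hit the stop()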

R/pkg/tests/fulltests/test_sparkSQL.R

Lines changed: 16 additions & 16 deletions
@@ -2593,8 +2593,8 @@ test_that("join(), crossJoin() and merge() on a DataFrame", {
 writeLines(mockLines3, jsonPath3)
 df3 <- read.json(jsonPath3)
 expect_error(merge(df, df3),
-paste("The following column name: name_y occurs more than once in the 'DataFrame'.",
-"Please use different suffixes for the intersected columns.", sep = ""))
+paste0("The following column name: name_y occurs more than once in the 'DataFrame'.",
+"Please use different suffixes for the intersected columns."))
 
 unlink(jsonPath2)
 unlink(jsonPath3)
@@ -2637,20 +2637,20 @@ test_that("toJSON() on DataFrame", {
 
 test_that("showDF()", {
 df <- read.json(jsonPath)
-expected <- paste("+----+-------+\n",
-"| age| name|\n",
-"+----+-------+\n",
-"|null|Michael|\n",
-"| 30| Andy|\n",
-"| 19| Justin|\n",
-"+----+-------+\n", sep = "")
-expected2 <- paste("+---+----+\n",
-"|age|name|\n",
-"+---+----+\n",
-"|nul| Mic|\n",
-"| 30| And|\n",
-"| 19| Jus|\n",
-"+---+----+\n", sep = "")
+expected <- paste("+----+-------+",
+"| age| name|",
+"+----+-------+",
+"|null|Michael|",
+"| 30| Andy|",
+"| 19| Justin|",
+"+----+-------+\n", sep = "\n")
+expected2 <- paste("+---+----+",
+"|age|name|",
+"+---+----+",
+"|nul| Mic|",
+"| 30| And|",
+"| 19| Jus|",
+"+---+----+\n", sep = "\n")
 expect_output(showDF(df), expected)
 expect_output(showDF(df, truncate = 3), expected2)
 })
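The rewritten expectations build byte-for-byte the same strings: the newlines move from the end of each piece into sep = "\n", with the final piece keeping its trailing "\n". A quick standalone check on a shortened table:

    old <- paste("+----+\n", "|row1|\n", "+----+\n", sep = "")
    new <- paste("+----+", "|row1|", "+----+\n", sep = "\n")
    identical(old, new)  # TRUE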

dev/deps/spark-deps-hadoop-2.7-hive-1.2

Lines changed: 3 additions & 3 deletions
@@ -160,9 +160,9 @@ objenesis/2.5.1//objenesis-2.5.1.jar
 okhttp/3.12.6//okhttp-3.12.6.jar
 okio/1.15.0//okio-1.15.0.jar
 opencsv/2.3//opencsv-2.3.jar
-orc-core/1.5.9/nohive/orc-core-1.5.9-nohive.jar
-orc-mapreduce/1.5.9/nohive/orc-mapreduce-1.5.9-nohive.jar
-orc-shims/1.5.9//orc-shims-1.5.9.jar
+orc-core/1.5.10/nohive/orc-core-1.5.10-nohive.jar
+orc-mapreduce/1.5.10/nohive/orc-mapreduce-1.5.10-nohive.jar
+orc-shims/1.5.10//orc-shims-1.5.10.jar
 oro/2.0.8//oro-2.0.8.jar
 osgi-resource-locator/1.0.3//osgi-resource-locator-1.0.3.jar
 paranamer/2.8//paranamer-2.8.jar

dev/deps/spark-deps-hadoop-2.7-hive-2.3

Lines changed: 3 additions & 3 deletions
@@ -175,9 +175,9 @@ objenesis/2.5.1//objenesis-2.5.1.jar
 okhttp/3.12.6//okhttp-3.12.6.jar
 okio/1.15.0//okio-1.15.0.jar
 opencsv/2.3//opencsv-2.3.jar
-orc-core/1.5.9//orc-core-1.5.9.jar
-orc-mapreduce/1.5.9//orc-mapreduce-1.5.9.jar
-orc-shims/1.5.9//orc-shims-1.5.9.jar
+orc-core/1.5.10//orc-core-1.5.10.jar
+orc-mapreduce/1.5.10//orc-mapreduce-1.5.10.jar
+orc-shims/1.5.10//orc-shims-1.5.10.jar
 oro/2.0.8//oro-2.0.8.jar
 osgi-resource-locator/1.0.3//osgi-resource-locator-1.0.3.jar
 paranamer/2.8//paranamer-2.8.jar
