@@ -271,7 +271,7 @@ setMethod("show", "SparkDataFrame",
271
271
paste(l , collapse = " :" )
272
272
})
273
273
s <- paste(cols , collapse = " , " )
274
- cat(paste (class(object ), " [" , s , " ]\n " , sep = " " ))
274
+ cat(paste0 (class(object ), " [" , s , " ]\n " ))
275
275
}
276
276
})
277
277
@@ -1659,9 +1659,7 @@ setMethod("dapplyCollect",
1659
1659
# '
1660
1660
# ' @param cols grouping columns.
1661
1661
# ' @param func a function to be applied to each group partition specified by grouping
1662
- # ' column of the SparkDataFrame. The function \code{func} takes as argument
1663
- # ' a key - grouping columns and a data frame - a local R data.frame.
1664
- # ' The output of \code{func} is a local R data.frame.
1662
+ # ' column of the SparkDataFrame. See Details.
1665
1663
# ' @param schema the schema of the resulting SparkDataFrame after the function is applied.
1666
1664
# ' The schema must match the output of \code{func}. It has to be defined for each
1667
1665
# ' output column with preferred output column name and corresponding data type.
@@ -1671,29 +1669,43 @@ setMethod("dapplyCollect",
1671
1669
# ' @aliases gapply,SparkDataFrame-method
1672
1670
# ' @rdname gapply
1673
1671
# ' @name gapply
1672
+ # ' @details
1673
+ # ' \code{func} is a function of two arguments. The first, usually named \code{key}
1674
+ # ' (though this is not enforced), corresponds to the grouping key and will be an
1675
+ # ' unnamed \code{list} of \code{length(cols)} length-one objects corresponding
1676
+ # ' to the grouping columns' values for the current group.
1677
+ # '
1678
+ # ' The second, herein \code{x}, will be a local \code{\link{data.frame}} with the
1679
+ # ' columns of the input not in \code{cols} for the rows corresponding to \code{key}.
1680
+ # '
1681
+ # ' The output of \code{func} must be a \code{data.frame} matching \code{schema} --
1682
+ # ' in particular this means the names of the output \code{data.frame} are irrelevant.
1683
+ # '
1674
1684
# ' @seealso \link{gapplyCollect}
1675
1685
# ' @examples
1676
1686
# '
1677
1687
# ' \dontrun{
1678
- # ' Computes the arithmetic mean of the second column by grouping
1679
- # ' on the first and third columns. Output the grouping values and the average.
1688
+ # ' # Computes the arithmetic mean of the second column by grouping
1689
+ # ' # on the first and third columns. Output the grouping values and the average.
1680
1690
# '
1681
1691
# ' df <- createDataFrame (
1682
1692
# ' list(list(1L, 1, "1", 0.1), list(1L, 2, "1", 0.2), list(3L, 3, "3", 0.3)),
1683
1693
# ' c("a", "b", "c", "d"))
1684
1694
# '
1685
- # ' Here our output contains three columns, the key which is a combination of two
1686
- # ' columns with data types integer and string and the mean which is a double.
1695
+ # ' # Here our output contains three columns, the key which is a combination of two
1696
+ # ' # columns with data types integer and string and the mean which is a double.
1687
1697
# ' schema <- structType(structField("a", "integer"), structField("c", "string"),
1688
1698
# ' structField("avg", "double"))
1689
1699
# ' result <- gapply(
1690
1700
# ' df,
1691
1701
# ' c("a", "c"),
1692
1702
# ' function(key, x) {
1703
+ # ' # key will either be list(1L, '1') (for the group where a=1L,c='1') or
1704
+ # ' # list(3L, '3') (for the group where a=3L,c='3')
1693
1705
# ' y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE)
1694
1706
# ' }, schema)
1695
1707
# '
1696
- # ' The schema also can be specified in a DDL-formatted string.
1708
+ # ' # The schema can also be specified in a DDL-formatted string.
1697
1709
# ' schema <- "a INT, c STRING, avg DOUBLE"
1698
1710
# ' result <- gapply(
1699
1711
# ' df,
@@ -1702,8 +1714,8 @@ setMethod("dapplyCollect",
1702
1714
# ' y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE)
1703
1715
# ' }, schema)
1704
1716
# '
1705
- # ' We can also group the data and afterwards call gapply on GroupedData.
1706
- # ' For Example :
1717
+ # ' # We can also group the data and afterwards call gapply on GroupedData.
1718
+ # ' # For example :
1707
1719
# ' gdf <- group_by(df, "a", "c")
1708
1720
# ' result <- gapply(
1709
1721
# ' gdf,
@@ -1712,15 +1724,15 @@ setMethod("dapplyCollect",
1712
1724
# ' }, schema)
1713
1725
# ' collect(result)
1714
1726
# '
1715
- # ' Result
1716
- # ' ------
1717
- # ' a c avg
1718
- # ' 3 3 3.0
1719
- # ' 1 1 1.5
1727
+ # ' # Result
1728
+ # ' # ------
1729
+ # ' # a c avg
1730
+ # ' # 3 3 3.0
1731
+ # ' # 1 1 1.5
1720
1732
# '
1721
- # ' Fits linear models on iris dataset by grouping on the 'Species' column and
1722
- # ' using 'Sepal_Length' as a target variable, 'Sepal_Width', 'Petal_Length'
1723
- # ' and 'Petal_Width' as training features.
1733
+ # ' # Fits linear models on iris dataset by grouping on the 'Species' column and
1734
+ # ' # using 'Sepal_Length' as a target variable, 'Sepal_Width', 'Petal_Length'
1735
+ # ' # and 'Petal_Width' as training features.
1724
1736
# '
1725
1737
# ' df <- createDataFrame (iris)
1726
1738
# ' schema <- structType(structField("(Intercept)", "double"),
@@ -1736,12 +1748,12 @@ setMethod("dapplyCollect",
1736
1748
# ' }, schema)
1737
1749
# ' collect(df1)
1738
1750
# '
1739
- # ' Result
1740
- # ' ---------
1741
- # ' Model (Intercept) Sepal_Width Petal_Length Petal_Width
1742
- # ' 1 0.699883 0.3303370 0.9455356 -0.1697527
1743
- # ' 2 1.895540 0.3868576 0.9083370 -0.6792238
1744
- # ' 3 2.351890 0.6548350 0.2375602 0.2521257
1751
+ # ' # Result
1752
+ # ' # ---------
1753
+ # ' # Model (Intercept) Sepal_Width Petal_Length Petal_Width
1754
+ # ' # 1 0.699883 0.3303370 0.9455356 -0.1697527
1755
+ # ' # 2 1.895540 0.3868576 0.9083370 -0.6792238
1756
+ # ' # 3 2.351890 0.6548350 0.2375602 0.2521257
1745
1757
# '
1746
1758
# '}
1747
1759
# ' @note gapply(SparkDataFrame) since 2.0.0
@@ -1759,20 +1771,30 @@ setMethod("gapply",
1759
1771
# '
1760
1772
# ' @param cols grouping columns.
1761
1773
# ' @param func a function to be applied to each group partition specified by grouping
1762
- # ' column of the SparkDataFrame. The function \code{func} takes as argument
1763
- # ' a key - grouping columns and a data frame - a local R data.frame.
1764
- # ' The output of \code{func} is a local R data.frame.
1774
+ # ' column of the SparkDataFrame. See Details.
1765
1775
# ' @return A data.frame.
1766
1776
# ' @family SparkDataFrame functions
1767
1777
# ' @aliases gapplyCollect,SparkDataFrame-method
1768
1778
# ' @rdname gapplyCollect
1769
1779
# ' @name gapplyCollect
1780
+ # ' @details
1781
+ # ' \code{func} is a function of two arguments. The first, usually named \code{key}
1782
+ # ' (though this is not enforced), corresponds to the grouping key and will be an
1783
+ # ' unnamed \code{list} of \code{length(cols)} length-one objects corresponding
1784
+ # ' to the grouping columns' values for the current group.
1785
+ # '
1786
+ # ' The second, herein \code{x}, will be a local \code{\link{data.frame}} with the
1787
+ # ' columns of the input not in \code{cols} for the rows corresponding to \code{key}.
1788
+ # '
1789
+ # ' The output of \code{func} must be a \code{data.frame} matching \code{schema} --
1790
+ # ' in particular this means the names of the output \code{data.frame} are irrelevant.
1791
+ # '
1770
1792
# ' @seealso \link{gapply}
1771
1793
# ' @examples
1772
1794
# '
1773
1795
# ' \dontrun{
1774
- # ' Computes the arithmetic mean of the second column by grouping
1775
- # ' on the first and third columns. Output the grouping values and the average.
1796
+ # ' # Computes the arithmetic mean of the second column by grouping
1797
+ # ' # on the first and third columns. Output the grouping values and the average.
1776
1798
# '
1777
1799
# ' df <- createDataFrame (
1778
1800
# ' list(list(1L, 1, "1", 0.1), list(1L, 2, "1", 0.2), list(3L, 3, "3", 0.3)),
@@ -1787,8 +1809,8 @@ setMethod("gapply",
1787
1809
# ' y
1788
1810
# ' })
1789
1811
# '
1790
- # ' We can also group the data and afterwards call gapply on GroupedData.
1791
- # ' For Example :
1812
+ # ' # We can also group the data and afterwards call gapplyCollect on GroupedData.
1813
+ # ' # For example :
1792
1814
# ' gdf <- group_by(df, "a", "c")
1793
1815
# ' result <- gapplyCollect(
1794
1816
# ' gdf,
@@ -1798,15 +1820,15 @@ setMethod("gapply",
1798
1820
# ' y
1799
1821
# ' })
1800
1822
# '
1801
- # ' Result
1802
- # ' ------
1803
- # ' key_a key_c mean_b
1804
- # ' 3 3 3.0
1805
- # ' 1 1 1.5
1823
+ # ' # Result
1824
+ # ' # ------
1825
+ # ' # key_a key_c mean_b
1826
+ # ' # 3 3 3.0
1827
+ # ' # 1 1 1.5
1806
1828
# '
1807
- # ' Fits linear models on iris dataset by grouping on the 'Species' column and
1808
- # ' using 'Sepal_Length' as a target variable, 'Sepal_Width', 'Petal_Length'
1809
- # ' and 'Petal_Width' as training features.
1829
+ # ' # Fits linear models on iris dataset by grouping on the 'Species' column and
1830
+ # ' # using 'Sepal_Length' as a target variable, 'Sepal_Width', 'Petal_Length'
1831
+ # ' # and 'Petal_Width' as training features.
1810
1832
# '
1811
1833
# ' df <- createDataFrame (iris)
1812
1834
# ' result <- gapplyCollect(
@@ -1818,12 +1840,12 @@ setMethod("gapply",
1818
1840
# ' data.frame(t(coef(m)))
1819
1841
# ' })
1820
1842
# '
1821
- # ' Result
1822
- # '---------
1823
- # ' Model X.Intercept. Sepal_Width Petal_Length Petal_Width
1824
- # ' 1 0.699883 0.3303370 0.9455356 -0.1697527
1825
- # ' 2 1.895540 0.3868576 0.9083370 -0.6792238
1826
- # ' 3 2.351890 0.6548350 0.2375602 0.2521257
1843
+ # ' # Result
1844
+ # ' # ---------
1845
+ # ' # Model X.Intercept. Sepal_Width Petal_Length Petal_Width
1846
+ # ' # 1 0.699883 0.3303370 0.9455356 -0.1697527
1847
+ # ' # 2 1.895540 0.3868576 0.9083370 -0.6792238
1848
+ # ' # 3 2.351890 0.6548350 0.2375602 0.2521257
1827
1849
# '
1828
1850
# '}
1829
1851
# ' @note gapplyCollect(SparkDataFrame) since 2.0.0
@@ -2735,10 +2757,10 @@ setMethod("merge",
2735
2757
colY <- joinY [[i ]]
2736
2758
2737
2759
if (colX %in% by ) {
2738
- colX <- paste (colX , suffixes [1 ], sep = " " )
2760
+ colX <- paste0 (colX , suffixes [1 ])
2739
2761
}
2740
2762
if (colY %in% by ) {
2741
- colY <- paste (colY , suffixes [2 ], sep = " " )
2763
+ colY <- paste0 (colY , suffixes [2 ])
2742
2764
}
2743
2765
2744
2766
colX <- getColumn(xsel , colX )
@@ -2753,7 +2775,7 @@ setMethod("merge",
2753
2775
2754
2776
# sorts the result by 'by' columns if sort = TRUE
2755
2777
if (sort && length(by ) > 0 ) {
2756
- colNameWithSuffix <- paste (by , suffixes [2 ], sep = " " )
2778
+ colNameWithSuffix <- paste0 (by , suffixes [2 ])
2757
2779
joinRes <- do.call(" arrange" , c(joinRes , colNameWithSuffix , decreasing = FALSE ))
2758
2780
}
2759
2781
@@ -2776,7 +2798,7 @@ genAliasesForIntersectedCols <- function(x, intersectedColNames, suffix) {
2776
2798
cols <- lapply(allColNames , function (colName ) {
2777
2799
col <- getColumn(x , colName )
2778
2800
if (colName %in% intersectedColNames ) {
2779
- newJoin <- paste (colName , suffix , sep = " " )
2801
+ newJoin <- paste0 (colName , suffix )
2780
2802
if (newJoin %in% allColNames ) {
2781
2803
stop(" The following column name: " , newJoin , " occurs more than once in the 'DataFrame'." ,
2782
2804
" Please use different suffixes for the intersected columns." )
0 commit comments