-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathdata_types__exercises.py
87 lines (59 loc) · 1.83 KB
/
data_types__exercises.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# To add a new cell, type '# %%'
# To add a new markdown cell, type '# %% [markdown]'
# %%
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
# Obtain a Spark session (an existing one is reused when available).
spark = SparkSession.builder.getOrCreate()
# %%
# Single-column DataFrame of date-like strings. Only the column name is
# supplied, so the printed schema shows the type Spark inferred for it.
some_timestamps = spark.createDataFrame(
    data=[
        ["2019-04-01"],
        ["2020-07-17"],
        ["1994-12-03"],
    ],
    schema=["as_string"],
)
some_timestamps.printSchema()
# %%
# Step 1: add a DateType column derived from the string column.
some_timestamps = some_timestamps.withColumn(
    "as_date",
    F.col("as_string").cast(T.DateType()),
)
some_timestamps.printSchema()
# %%
# Step 2: add a TimestampType column derived from the date column.
some_timestamps = some_timestamps.withColumn(
    "as_timestamp",
    F.col("as_date").cast(T.TimestampType()),
)
some_timestamps.printSchema()
# %%
# Display the string, date and timestamp representations side by side.
some_timestamps.show()
# %%
# Raw inputs for the casting exercise: every value is a string, and each
# column mixes several notations for the same kind of value.
data = [
    ["1.0", "2020-04-07", "3"],
    ["1042,5", "2015-06-19", "17,042,174"],
    ["17.03.04178", "2019/12/25", "17_092"],
]
# All three fields are declared as strings; no casting happens here.
schema = T.StructType(
    [
        T.StructField(field_name, T.StringType())
        for field_name in (
            "number_with_decimal",
            "dates_inconsistently_formatted",
            "integer_with_separators",
        )
    ]
)
cast_df = spark.createDataFrame(data, schema)
# Show all three rows without truncating the cell contents.
cast_df.show(n=3, truncate=False)
# %%
# Re-project each column under its original name, cast from string to the
# type its name suggests (double, date, long).
# NOTE(review): with ANSI mode off, strings that cannot be parsed presumably
# come back as null rather than raising -- confirm on the target Spark version.
cast_df = cast_df.select(
    *[
        F.col(column_name).cast(target_type).alias(column_name)
        for column_name, target_type in (
            ("number_with_decimal", T.DoubleType()),
            ("dates_inconsistently_formatted", T.DateType()),
            ("integer_with_separators", T.LongType()),
        )
    ]
)
cast_df.show(n=3, truncate=False)
# %%
# NOTE(review): the fill value is a string, while the target column was cast
# to DateType earlier in this file. The PySpark docs state that fillna skips
# subset columns whose type does not match the value, so this call presumably
# leaves the column unchanged -- confirm against the displayed output.
cast_df.fillna(
    value="2020, 1, 1",
    subset=["dates_inconsistently_formatted"],
).show()
# %%
from datetime import date as d

# Overwrite the date column in place: null entries become 2020-01-01,
# every non-null date is carried over unchanged.
cast_df = cast_df.withColumn(
    "dates_inconsistently_formatted",
    F.when(
        F.col("dates_inconsistently_formatted").isNull(),
        F.lit(d(2020, 1, 1)),
    ).otherwise(F.col("dates_inconsistently_formatted")),
)
cast_df.show()