Skip to content

Commit

Permalink
Rearranging examples.
Browse files Browse the repository at this point in the history
  • Loading branch information
isabekov committed Nov 15, 2024
1 parent 2fdd5be commit a21ace0
Showing 1 changed file with 27 additions and 27 deletions.
54 changes: 27 additions & 27 deletions pyspark_cookbook.org
Original file line number Diff line number Diff line change
Expand Up @@ -1529,33 +1529,6 @@ Using split function and remove last character:
| on a | Prefix_on a |

* Time operations
** To calculate cumulative sum of a column
#+BEGIN_SRC python :post pretty2orgtbl(data=*this*)
import pandas as pd
from pyspark.sql import Window
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession

# Local Spark session for the self-contained example.
spark = SparkSession.builder.master("local").appName("test-app").getOrCreate()

# Sample data: a time index plus a boolean flag column.
df = pd.DataFrame({'time': [0, 1, 2, 3, 4, 5],
                   'value': [False, False, True, False, True, True]})

df = spark.createDataFrame(df)

# Unbounded running window ordered by time — hoisted once instead of
# being rebuilt for each withColumn call.
w = Window.orderBy(F.col("time").asc())

# Cast the boolean column directly to int (True -> 1, False -> 0) rather
# than comparing with `== True` / `== False` (PEP 8 / E712); the count of
# False values uses the negated column.
df = df.withColumn("cml_n_true", F.sum(F.col("value").cast("int")).over(w))
df = df.withColumn("cml_n_false", F.sum((~F.col("value")).cast("int")).over(w))
df.show()
#+END_SRC

#+RESULTS:
|time|value|cml_n_true|cml_n_false|
|----+-----+----------+-----------|
| 0|false| 0| 1|
| 1|false| 0| 2|
| 2| true| 1| 2|
| 3|false| 1| 3|
| 4| true| 2| 3|
| 5| true| 3| 3|

** To convert Unix time stamp to human readable format
#+BEGIN_SRC python :post pretty2orgtbl(data=*this*)
import pyspark.sql.functions as F
Expand Down Expand Up @@ -1913,6 +1886,33 @@ root
|{Craig, , Brown} |1011|M |88 |92 |88 |
:end:

** To calculate cumulative sum of a column
#+BEGIN_SRC python :post pretty2orgtbl(data=*this*)
import pandas as pd
from pyspark.sql import Window
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession

# Local Spark session for the self-contained example.
spark = SparkSession.builder.master("local").appName("test-app").getOrCreate()

# Sample data: a time index plus a boolean flag column.
df = pd.DataFrame({'time': [0, 1, 2, 3, 4, 5],
                   'value': [False, False, True, False, True, True]})

df = spark.createDataFrame(df)

# Unbounded running window ordered by time — hoisted once instead of
# being rebuilt for each withColumn call.
w = Window.orderBy(F.col("time").asc())

# Cast the boolean column directly to int (True -> 1, False -> 0) rather
# than comparing with `== True` / `== False` (PEP 8 / E712); the count of
# False values uses the negated column.
df = df.withColumn("cml_n_true", F.sum(F.col("value").cast("int")).over(w))
df = df.withColumn("cml_n_false", F.sum((~F.col("value")).cast("int")).over(w))
df.show()
#+END_SRC

#+RESULTS:
|time|value|cml_n_true|cml_n_false|
|----+-----+----------+-----------|
| 0|false| 0| 1|
| 1|false| 0| 2|
| 2| true| 1| 2|
| 3|false| 1| 3|
| 4| true| 2| 3|
| 5| true| 3| 3|

** To calculate difference of values of two consecutive rows for a certain column
#+BEGIN_SRC python :post pretty2orgtbl(data=*this*)
import pyspark.sql.functions as F
Expand Down

0 comments on commit a21ace0

Please sign in to comment.