LeetCode Database problems solved using PySpark
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, BooleanType, TimestampType, DateType
from pyspark.sql.functions import sum, col, count, when, lead, collect_list, length, year, countDistinct, round, first, lit
from datetime import datetime
spark = SparkSession.builder.master('local[*]').appName('PysparkDataPipeline').getOrCreate()
Creating an empty DataFrame
schema = StructType([StructField('column_a', IntegerType(), True),
StructField('column_b', StringType(), False)])
df = spark.createDataFrame([], schema=schema)
df.show()
+--------+--------+
|column_a|column_b|
+--------+--------+
+--------+--------+
Append data to it
df2 = spark.createDataFrame([[1, 'a'],
[2, 'b'],
[3, 'c'],
[None, 'd']], schema=schema)
df2.show()
+--------+--------+
|column_a|column_b|
+--------+--------+
| 1| a|
| 2| b|
| 3| c|
| null| d|
+--------+--------+
df = df.union(df2)
df.show()
+--------+--------+
|column_a|column_b|
+--------+--------+
| 1| a|
| 2| b|
| 3| c|
| null| d|
+--------+--------+
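Note that union() matches columns purely by position. As a minimal alternative sketch (same schema and df2 as above; df_by_name is just an illustrative name), unionByName matches columns by name, which guards against accidental column reordering between the two DataFrames.
# Sketch: name-based union instead of position-based union
df_by_name = spark.createDataFrame([], schema=schema).unionByName(df2)
df_by_name.show()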
Fill Null Values
df = df.fillna(1, subset=['column_a'])
df.show()
+--------+--------+
|column_a|column_b|
+--------+--------+
| 1| a|
| 2| b|
| 3| c|
| 1| d|
+--------+--------+
Replace all 1's with 2's
from pyspark.sql.functions import when
df = df.replace(1, 2, subset=['column_a'])
df.show()
+--------+--------+
|column_a|column_b|
+--------+--------+
| 2| a|
| 2| b|
| 3| c|
| 2| d|
+--------+--------+
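The same replacement can also be expressed with when/otherwise (already imported above). A sketch, shown without reassigning df so the pipeline above is unchanged; applied before the replace() call it would produce the identical result.
# Sketch: replace 1 with 2 via a conditional expression instead of DataFrame.replace
df.withColumn('column_a', when(col('column_a') == 1, 2).otherwise(col('column_a'))).show()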
LeetCode SQL problem
Replace Employee ID With The Unique Identifier
https://leetcode.com/problems/replace-employee-id-with-the-unique-identifier/description/?envType=study-plan-v2&envId=top-sql-50
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
employees_schema = StructType([StructField('id', IntegerType(), False),
StructField('name', StringType(), True)])
employees_df = spark.createDataFrame([[1, 'Alice'],
[7, 'Bob'],
[11, 'Meir'],
[90, 'Winston'],
[3, 'Jonathan']], schema=employees_schema)
employee_uni_schema = StructType([StructField('id', IntegerType(), False),
StructField('unique_id', IntegerType(), True)])
employee_uni_df = spark.createDataFrame([[3, 1],
[11, 2],
[90, 3]], schema=employee_uni_schema)
employees_df.show()
+---+--------+
| id| name|
+---+--------+
| 1| Alice|
| 7| Bob|
| 11| Meir|
| 90| Winston|
| 3|Jonathan|
+---+--------+
employee_uni_df.show()
+---+---------+
| id|unique_id|
+---+---------+
| 3| 1|
| 11| 2|
| 90| 3|
+---+---------+
employees_df.join(employee_uni_df, how='left', on=['id']).select('unique_id', 'name').show()
+---------+--------+
|unique_id| name|
+---------+--------+
| null| Alice|
| null| Bob|
| 2| Meir|
| 1|Jonathan|
| 3| Winston|
+---------+--------+
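For comparison, a sketch of the same left join through Spark SQL (the view names below are only illustrative):
# Register the DataFrames as temp views and run the SQL form of the solution
employees_df.createOrReplaceTempView('Employees')
employee_uni_df.createOrReplaceTempView('EmployeeUNI')
spark.sql('''select u.unique_id, e.name
             from Employees e
             left join EmployeeUNI u on e.id = u.id''').show()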
Leetcode SQL problem - Confirmation Rate
https://leetcode.com/problems/confirmation-rate/description/?envType=study-plan-v2&envId=top-sql-50
signups_schema = StructType([StructField('user_id', IntegerType(), False),
StructField('time_stamp', TimestampType(), True)])
confirmations_schema = StructType([StructField('user_id', IntegerType(), True),
StructField('time_stamp', TimestampType(), True),
StructField('action', StringType(), True)])
signups_df = spark.createDataFrame([[3, datetime.strptime('2020-03-21 10:16:13', '%Y-%m-%d %H:%M:%S')],
[7, datetime.strptime('2020-01-04 13:57:59', '%Y-%m-%d %H:%M:%S')],
[2, datetime.strptime('2020-07-29 23:09:44', '%Y-%m-%d %H:%M:%S')],
[6, datetime.strptime('2020-12-09 10:39:37', '%Y-%m-%d %H:%M:%S')]], schema=signups_schema)
confirmations_df = spark.createDataFrame([[3, datetime.strptime('2021-01-06 03:30:46', '%Y-%m-%d %H:%M:%S'), 'timeout'],
[3, datetime.strptime('2021-07-14 14:00:00', '%Y-%m-%d %H:%M:%S'), 'timeout'],
[7, datetime.strptime('2021-06-12 11:57:29', '%Y-%m-%d %H:%M:%S'), 'confirmed'],
[7, datetime.strptime('2021-06-13 12:58:28', '%Y-%m-%d %H:%M:%S'), 'confirmed'],
[7, datetime.strptime('2021-06-14 13:59:27', '%Y-%m-%d %H:%M:%S'), 'confirmed'],
[2, datetime.strptime('2021-01-22 00:00:00', '%Y-%m-%d %H:%M:%S'), 'confirmed'],
[2, datetime.strptime('2021-02-28 23:59:59', '%Y-%m-%d %H:%M:%S'), 'timeout']], schema=confirmations_schema)
result_df = signups_df.join(confirmations_df, how='left', on=['user_id'])
(result_df.groupBy('user_id')
    .agg((sum(when(result_df.action == 'confirmed', 1).otherwise(0.0)) / count('*'))
         .alias('confirmation_rate'))
    .show())
+-------+-----------------+
|user_id|confirmation_rate|
+-------+-----------------+
| 3| 0.0|
| 7| 1.0|
| 2| 0.5|
| 6| 0.0|
+-------+-----------------+
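The LeetCode problem expects the rate rounded to two decimal places. A sketch of the same aggregation via a conditional average (avg is not in the top import list, so it is imported here; round/when are already imported above):
from pyspark.sql.functions import avg
# Sketch: average of a 0/1 indicator, rounded to 2 decimals
(result_df.groupBy('user_id')
    .agg(round(avg(when(col('action') == 'confirmed', 1).otherwise(0)), 2)
         .alias('confirmation_rate'))
    .show())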
Find Users With Valid E-Mails
https://leetcode.com/problems/find-users-with-valid-e-mails/description/?envType=study-plan-v2&envId=30-days-of-pandas&lang=pythondata
schema = StructType([StructField('user_id', IntegerType(), False),
StructField('name', StringType(), True),
StructField('mail', StringType(), True)])
df = spark.createDataFrame([[1, 'Winston', 'winston@leetcode.com'],
[2, 'Jonathan', 'jonathanisgreat'],
[3, 'Annabelle', 'bella-@leetcode.com'],
[4, 'Sally', 'sally.come@leetcode.com'],
[5, 'Marwan', 'quarz#2020@leetcode.com'],
[6, 'David', 'david69@gmail.com'],
[7, 'Shapiro', '.shapo@leetcode.com']], schema=schema)
df.show()
+-------+---------+--------------------+
|user_id| name| mail|
+-------+---------+--------------------+
| 1| Winston|winston@leetcode.com|
| 2| Jonathan| jonathanisgreat|
| 3|Annabelle| bella-@leetcode.com|
| 4| Sally|sally.come@leetco...|
| 5| Marwan|quarz#2020@leetco...|
| 6| David| david69@gmail.com|
| 7| Shapiro| .shapo@leetcode.com|
+-------+---------+--------------------+
df.filter(col('mail').rlike(r'^[a-zA-Z][a-zA-Z0-9_.-]*@leetcode\.com$')).show()
+-------+---------+--------------------+
|user_id| name| mail|
+-------+---------+--------------------+
| 1| Winston|winston@leetcode.com|
| 3|Annabelle| bella-@leetcode.com|
| 4| Sally|sally.come@leetco...|
+-------+---------+--------------------+
Consecutive Numbers
https://leetcode.com/problems/consecutive-numbers/
schema = StructType([StructField('id', IntegerType(), False),
StructField('num', StringType(), True)])
df = spark.createDataFrame([[1, '1'],
[2, '1'],
[3, '1'],
[4, '2'],
[5, '1'],
[6, '2'],
[7, '2']], schema=schema)
df.createOrReplaceTempView('df')
spark.sql('''with temp as (
select num,
lead(num, 1) over (order by id) num1,
lead(num, 2) over (order by id) num2
from df
)
select distinct num
from temp
where num = num1
and num = num2''').show()
+---+
|num|
+---+
| 1|
+---+
23/08/03 15:05:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/03 15:05:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/03 15:05:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/03 15:05:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
(df.withColumn('num1', lead('num', 1).over(Window.orderBy('id')))
   .withColumn('num2', lead('num', 2).over(Window.orderBy('id')))
   .filter((col('num') == col('num1')) & (col('num') == col('num2')))
   .select('id').distinct().show())
+---+
| id|
+---+
| 1|
+---+
23/08/03 15:56:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/03 15:56:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/03 15:56:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/03 15:56:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
Students and classes example
schema = StructType([StructField('class', IntegerType(), True),
StructField('student_id', IntegerType(), True),
StructField('term', IntegerType(), True),
StructField('subject', StringType(), True),
StructField('marks', IntegerType(), True)])
df = spark.createDataFrame([[2, 1, 1, 'maths', 10],
[2, 1, 2, 'maths', 12],
[2, 1, 1, 'english', 14],
[2, 1, 2, 'english', 12],
[3, 2, 1, 'maths', 10],
[3, 2, 2, 'maths', 12],
[3, 2, 1, 'english', 16],
[3, 2, 2, 'english', 14]], schema=schema)
df.show()
+-----+----------+----+-------+-----+
|class|student_id|term|subject|marks|
+-----+----------+----+-------+-----+
| 2| 1| 1| maths| 10|
| 2| 1| 2| maths| 12|
| 2| 1| 1|english| 14|
| 2| 1| 2|english| 12|
| 3| 2| 1| maths| 10|
| 3| 2| 2| maths| 12|
| 3| 2| 1|english| 16|
| 3| 2| 2|english| 14|
+-----+----------+----+-------+-----+
Get class 2 students in the following format --> class, student_id, subject, term1, term2
df = (df.orderBy('class', 'student_id', 'subject')
        .groupBy('class', 'student_id', 'subject')
        .agg(collect_list('marks')[0].alias('term1'), collect_list('marks')[1].alias('term2')))
df.show()
+-----+----------+-------+-----+-----+
|class|student_id|subject|term1|term2|
+-----+----------+-------+-----+-----+
| 2| 1|english| 14| 12|
| 2| 1| maths| 10| 12|
| 3| 2|english| 16| 14|
| 3| 2| maths| 10| 12|
+-----+----------+-------+-----+-----+
df.filter(col('class') == 2).show()
+-----+----------+-------+-----+-----+
|class|student_id|subject|term1|term2|
+-----+----------+-------+-----+-----+
| 2| 1|english| 14| 12|
| 2| 1| maths| 10| 12|
+-----+----------+-------+-----+-----+
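Note that Spark does not guarantee the order of collect_list after a groupBy, so picking element [0] as term1 and [1] as term2 relies on the incoming row order surviving the shuffle. A sketch of a pivot-based alternative that keys each mark to its term explicitly (marks_df below just re-creates a class-2 slice of the original rows for illustration; first/col are already imported above):
marks_df = spark.createDataFrame([[2, 1, 1, 'maths', 10],
                                  [2, 1, 2, 'maths', 12],
                                  [2, 1, 1, 'english', 14],
                                  [2, 1, 2, 'english', 12]], schema=schema)
# Sketch: pivot on term so term1/term2 columns do not depend on collect_list order
(marks_df.groupBy('class', 'student_id', 'subject')
    .pivot('term', [1, 2])
    .agg(first('marks'))
    .withColumnRenamed('1', 'term1')
    .withColumnRenamed('2', 'term2')
    .show())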
Get subject-wise aggregated score with 25% weightage to term1 and 75% weightage to term2
df = spark.createDataFrame([[2, 1, 1, 'maths', 10],
[2, 1, 2, 'maths', 12],
[2, 1, 1, 'english', 14],
[2, 1, 2, 'english', 12],
[3, 2, 1, 'maths', 10],
[3, 2, 2, 'maths', 12],
[3, 2, 1, 'english', 16],
[3, 2, 2, 'english', 14]], schema=schema)
df.show()
+-----+----------+----+-------+-----+
|class|student_id|term|subject|marks|
+-----+----------+----+-------+-----+
| 2| 1| 1| maths| 10|
| 2| 1| 2| maths| 12|
| 2| 1| 1|english| 14|
| 2| 1| 2|english| 12|
| 3| 2| 1| maths| 10|
| 3| 2| 2| maths| 12|
| 3| 2| 1|english| 16|
| 3| 2| 2|english| 14|
+-----+----------+----+-------+-----+
maths_agg = (df.filter(col('subject') == 'maths')
               .orderBy('student_id', 'term')
               .groupBy('student_id')
               .agg((collect_list('marks')[0] * 0.25 + collect_list('marks')[1] * 0.75).alias('maths_agg')))
english_agg = (df.filter(col('subject') == 'english')
                 .orderBy('student_id', 'term')
                 .groupBy('student_id')
                 .agg((collect_list('marks')[0] * 0.25 + collect_list('marks')[1] * 0.75).alias('english_agg')))
maths_agg.join(english_agg, how='inner', on='student_id').show()
+----------+---------+-----------+
|student_id|maths_agg|english_agg|
+----------+---------+-----------+
| 1| 11.5| 12.5|
| 2| 11.5| 14.5|
+----------+---------+-----------+
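The same caveat about collect_list ordering applies here. A sketch of the weighted score via conditional aggregation instead, assuming only terms 1 and 2 exist; it returns one row per student and subject rather than one column per subject (sum/when are imported above):
# Sketch: weight term 1 marks by 0.25 and all other (term 2) marks by 0.75
(df.groupBy('student_id', 'subject')
    .agg(sum(when(col('term') == 1, col('marks') * 0.25)
             .otherwise(col('marks') * 0.75)).alias('weighted_score'))
    .orderBy('student_id', 'subject')
    .show())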
Exchange Seats
https://leetcode.com/problems/exchange-seats/description/
import pandas as pd
import io
data = '''
id,student
1,Abbot
2,Doris
3,Emerson
4,Green
5,Jeames
'''
df = spark.createDataFrame(pd.read_csv(io.StringIO(data), header=0))
df.show()
+---+-------+
| id|student|
+---+-------+
| 1| Abbot|
| 2| Doris|
| 3|Emerson|
| 4| Green|
| 5| Jeames|
+---+-------+
max_id = max(df.select('id').collect())[0]
df.select(when(col('id') % 2 == 1,
lead('id', 1, max_id).over(Window.orderBy('id')))
.otherwise(col('id') - 1).alias('id'),
'Student').orderBy('id').show()
+---+-------+
| id|Student|
+---+-------+
| 1| Doris|
| 2| Abbot|
| 3| Green|
| 4|Emerson|
| 5| Jeames|
+---+-------+
23/08/09 17:41:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/09 17:41:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/09 17:41:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/09 17:41:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/09 17:41:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
Tree Node
https://leetcode.com/problems/tree-node/
schema = StructType([StructField('id', IntegerType(), False),
StructField('p_id', IntegerType(), True)])
df = spark.createDataFrame(
[[1, None],
[2, 1],
[3, 1],
[4, 2],
[5, 2]], schema=schema)
df.show()
+---+----+
| id|p_id|
+---+----+
| 1|null|
| 2| 1|
| 3| 1|
| 4| 2|
| 5| 2|
+---+----+
distinct_parent_ids = df.select('p_id').distinct().rdd.flatMap(lambda x: x).collect()
df.select('id',
when(col('p_id').isNull(), 'Root')
.when(col('p_id').isNotNull() & col('id').isin(distinct_parent_ids), 'Inner')
.otherwise('Leaf').alias('type')).show()
+---+-----+
| id| type|
+---+-----+
| 1| Root|
| 2|Inner|
| 3| Leaf|
| 4| Leaf|
| 5| Leaf|
+---+-----+
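Collecting the distinct parent ids to the driver is fine for a toy table. A sketch of the same classification done entirely with a join, so nothing has to leave the executors (parents and is_parent are illustrative names; lit/when/col are imported above):
# Sketch: build a set of parent ids as a DataFrame and left-join it back
parents = (df.filter(col('p_id').isNotNull())
           .select(col('p_id').alias('id')).distinct()
           .withColumn('is_parent', lit(True)))
(df.join(parents, on='id', how='left')
    .select('id',
            when(col('p_id').isNull(), 'Root')
            .when(col('is_parent'), 'Inner')
            .otherwise('Leaf').alias('type'))
    .orderBy('id').show())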
Pandas
pdf = df.toPandas()
pdf
id p_id
0 1 NaN
1 2 1.0
2 3 1.0
3 4 2.0
4 5 2.0
import numpy as np
pdf['type'] = np.where(pdf['p_id'].isna(),
'Root',
np.where(pdf['id'].isin(pdf['p_id'].unique()) & pdf['p_id'].notna(),
'Inner',
'Leaf'))
pdf[['id', 'type']]
id type
0 1 Root
1 2 Inner
2 3 Leaf
3 4 Leaf
4 5 Leaf
Customers who bought all products
https://leetcode.com/problems/customers-who-bought-all-products/description/
customer_schema = StructType([StructField('customer_id', IntegerType(), False),
StructField('product_key', IntegerType(), False)])
product_schema = StructType([StructField('product_key', IntegerType(), False)])
customer_df = spark.createDataFrame([[1, 5],
[2, 6],
[3, 5],
[3, 6],
[1, 6]], schema=customer_schema)
product_df = spark.createDataFrame([[5], [6]], schema=product_schema)
customer_df.show()
+-----------+-----------+
|customer_id|product_key|
+-----------+-----------+
| 1| 5|
| 2| 6|
| 3| 5|
| 3| 6|
| 1| 6|
+-----------+-----------+
product_df.show()
+-----------+
|product_key|
+-----------+
| 5|
| 6|
+-----------+
grouped_df = (customer_df.dropDuplicates()
              .groupBy('customer_id')
              .agg(count(col('customer_id')).alias('count')))
grouped_df.show()
+-----------+-----+
|customer_id|count|
+-----------+-----+
| 1| 2|
| 3| 2|
| 2| 1|
+-----------+-----+
grouped_df.filter(col('count') == product_df.count()).select('customer_id').show()
+-----------+
|customer_id|
+-----------+
| 1|
| 3|
+-----------+
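A sketch of the same answer with countDistinct, which also dedupes repeated (customer_id, product_key) rows without an explicit dropDuplicates() (countDistinct/col are imported above; total_products is an illustrative name):
total_products = product_df.count()
# Sketch: customers whose distinct product count equals the total product count
(customer_df.groupBy('customer_id')
    .agg(countDistinct('product_key').alias('distinct_products'))
    .filter(col('distinct_products') == total_products)
    .select('customer_id').show())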
Triangle Judgement
https://leetcode.com/problems/triangle-judgement/description/
schema = StructType([StructField('x', IntegerType(), False),
StructField('y', IntegerType(), False),
StructField('z', IntegerType(), False)])
df = spark.createDataFrame([[13, 15, 30],
[10, 20, 15]], schema=schema)
df.show()
+---+---+---+
| x| y| z|
+---+---+---+
| 13| 15| 30|
| 10| 20| 15|
+---+---+---+
df.withColumn('triangle', when((col('x') + col('y') > col('z'))
& (col('y') + col('z') > col('x'))
& (col('x') + col('z') > col('y')), 'Yes')
.otherwise('No')).show()
+---+---+---+--------+
| x| y| z|triangle|
+---+---+---+--------+
| 13| 15| 30| No|
| 10| 20| 15| Yes|
+---+---+---+--------+
Invalid Tweets
https://leetcode.com/problems/invalid-tweets/?envType=study-plan-v2&envId=30-days-of-pandas&lang=pythondata
schema = StructType([StructField('tweet_id', IntegerType(), False),
StructField('content', StringType(), True)])
df = spark.createDataFrame([[1, 'Vote for Biden'],
[2, 'Let us make America great again!']], schema=schema)
df.show(truncate=False)
+--------+--------------------------------+
|tweet_id|content |
+--------+--------------------------------+
|1 |Vote for Biden |
|2 |Let us make America great again!|
+--------+--------------------------------+
df.filter(length(col('content')) > 15).select('tweet_id').show()
+--------+
|tweet_id|
+--------+
| 2|
+--------+
Calculate Special Bonus
https://leetcode.com/problems/calculate-special-bonus/description/?envType=study-plan-v2&envId=30-days-of-pandas&lang=pythondata
schema = StructType([StructField('employee_id', IntegerType(), False),
StructField('name', StringType(), True),
StructField('salary', IntegerType(), True)])
df = spark.createDataFrame([[2, 'Meir', 3000],
[3, 'Michael', 3800],
[7, 'Addilyn', 7400],
[8, 'Juan', 6100],
[9, 'Kannon', 7700]], schema=schema)
df.show(truncate=False)
+-----------+-------+------+
|employee_id|name |salary|
+-----------+-------+------+
|2 |Meir |3000 |
|3 |Michael|3800 |
|7 |Addilyn|7400 |
|8 |Juan |6100 |
|9 |Kannon |7700 |
+-----------+-------+------+
(df.withColumn('bonus', when((col('employee_id') % 2 == 1) & (~col('name').startswith('M')),
                             col('salary')).otherwise(0))
   .select('employee_id', 'bonus').show())
+-----------+-----+
|employee_id|bonus|
+-----------+-----+
| 2| 0|
| 3| 0|
| 7| 7400|
| 8| 0|
| 9| 7700|
+-----------+-----+
Market Analysis I
https://leetcode.com/problems/market-analysis-i/description/
users_schema = StructType([StructField('user_id', IntegerType(), False),
StructField('join_date', StringType(), False),
StructField('favorite_brand', StringType(), False)])
users_df = spark.createDataFrame([[1, '2018-01-01', 'Lenovo'],
[2, '2018-02-09', 'Samsung'],
[3, '2018-01-09', 'LG'],
[4, '2018-05-21', 'HP']], schema=users_schema)
orders_schema = StructType([StructField('order_id', IntegerType(), False),
StructField('order_date', StringType(), False),
StructField('item_id', IntegerType(), False),
StructField('buyer_id', IntegerType(), False),
StructField('seller_id', IntegerType(), False)])
orders_df = spark.createDataFrame([[1, '2019-08-01', 4, 1, 2],
[2, '2018-08-02', 2, 1, 3],
[3, '2019-08-03', 3, 2, 3],
[4, '2018-08-04', 1, 4, 2],
[5, '2018-08-04', 1, 3, 4],
[6, '2019-08-05', 2, 2, 4]], schema=orders_schema)
users_df = users_df.withColumn('join_date', col('join_date').cast('date'))
orders_df = orders_df.withColumn('order_date', col('order_date').cast('date'))
users_df.printSchema()
root
|-- user_id: integer (nullable = false)
|-- join_date: date (nullable = true)
|-- favorite_brand: string (nullable = false)
orders_df.printSchema()
root
|-- order_id: integer (nullable = false)
|-- order_date: date (nullable = true)
|-- item_id: integer (nullable = false)
|-- buyer_id: integer (nullable = false)
|-- seller_id: integer (nullable = false)
orders_temp = (orders_df.filter(year(col('order_date')) == 2019)
               .groupBy('buyer_id')
               .agg(count('order_id').alias('orders_in_2019')))
orders_temp.show()
+--------+--------------+
|buyer_id|orders_in_2019|
+--------+--------------+
| 1| 1|
| 2| 2|
+--------+--------------+
(users_df.join(orders_temp, users_df.user_id == orders_temp.buyer_id, how='left')
    .select(col('user_id').alias('buyer_id'), 'join_date', 'orders_in_2019')
    .fillna(0, subset=['orders_in_2019'])
    .show())
+--------+----------+--------------+
|buyer_id| join_date|orders_in_2019|
+--------+----------+--------------+
| 1|2018-01-01| 1|
| 2|2018-02-09| 2|
| 3|2018-01-09| 0|
| 4|2018-05-21| 0|
+--------+----------+--------------+
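A sketch of the same result in a single pass: left join first, then count only the 2019 orders with a conditional sum, so buyers without any orders still appear with 0 (sum/when/year are imported above):
# Sketch: conditional aggregation instead of pre-filtering and a second join
(users_df.join(orders_df, users_df.user_id == orders_df.buyer_id, how='left')
    .groupBy(col('user_id').alias('buyer_id'), 'join_date')
    .agg(sum(when(year(col('order_date')) == 2019, 1).otherwise(0)).alias('orders_in_2019'))
    .orderBy('buyer_id').show())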
Department Highest Salary
https://leetcode.com/problems/department-highest-salary/
employee_schema = StructType([StructField('id', IntegerType(), False),
StructField('name', StringType(), False),
StructField('salary', IntegerType(), False),
StructField('departmentId', IntegerType(), False)])
employee_df = spark.createDataFrame([[1, 'Joe', 70000, 1],
[2, 'Jim', 90000, 1],
[3, 'Henry', 80000, 2],
[4, 'Sam', 60000, 2],
[5, 'Max', 90000, 1]], schema=employee_schema)
department_schema = StructType([StructField('id', IntegerType(), False),
StructField('name', StringType(), False)])
department_df = spark.createDataFrame([[1, 'IT'],
[2, 'Sales']], schema=department_schema)
employee_df.show()
+---+-----+------+------------+
| id| name|salary|departmentId|
+---+-----+------+------------+
| 1| Joe| 70000| 1|
| 2| Jim| 90000| 1|
| 3|Henry| 80000| 2|
| 4| Sam| 60000| 2|
| 5| Max| 90000| 1|
+---+-----+------+------------+
department_df.show()
+---+-----+
| id| name|
+---+-----+
| 1| IT|
| 2|Sales|
+---+-----+
max_salary = (employee_df.groupBy('departmentId').max('salary')
              .withColumnRenamed('max(salary)', 'max_salary'))
max_salary.show()
+------------+----------+
|departmentId|max_salary|
+------------+----------+
| 1| 90000|
| 2| 80000|
+------------+----------+
max_salary = max_salary.withColumnRenamed('departmentId', 'depId')
employee_df = employee_df.withColumnRenamed('name', 'empName')
(employee_df.join(max_salary,
                  (employee_df['departmentId'] == max_salary['depId'])
                  & (employee_df['salary'] == max_salary['max_salary']),
                  how='inner')
    .join(department_df,
          employee_df['departmentId'] == department_df['id'],
          how='inner')
    .select(col('name').alias('Department'),
            col('empName').alias('Employee'),
            col('salary').alias('Salary'))
    .show(truncate=False))
+----------+--------+------+
|Department|Employee|Salary|
+----------+--------+------+
|IT |Max |90000 |
|IT |Jim |90000 |
|Sales |Henry |80000 |
+----------+--------+------+
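A sketch of the same problem with a window function instead of joining back to a pre-aggregated max: rank salaries within each department and keep rank 1. It reuses the renamed employee_df/empName from above; dense_rank is imported here since it is not in the top import list, and dept_window/ranked are illustrative names.
from pyspark.sql.functions import dense_rank
dept_window = Window.partitionBy('departmentId').orderBy(col('salary').desc())
# Sketch: keep every employee tied for the top salary of their department
ranked = employee_df.withColumn('rnk', dense_rank().over(dept_window)).filter(col('rnk') == 1)
(ranked.join(department_df, ranked['departmentId'] == department_df['id'], how='inner')
    .select(col('name').alias('Department'),
            col('empName').alias('Employee'),
            col('salary').alias('Salary'))
    .show(truncate=False))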
The Number of Rich Customers
https://leetcode.com/problems/the-number-of-rich-customers/description/?envType=study-plan-v2&envId=30-days-of-pandas&lang=pythondata
import pandas as pd
import io
df = pd.read_csv(io.StringIO('''
bill_id,customer_id,amount
6,1,549
8,1,834
4,2,394
11,3,657
13,3,257
'''))
df
bill_id customer_id amount
0 6 1 549
1 8 1 834
2 4 2 394
3 11 3 657
4 13 3 257
df = spark.createDataFrame(df)
df.show()
+-------+-----------+------+
|bill_id|customer_id|amount|
+-------+-----------+------+
| 6| 1| 549|
| 8| 1| 834|
| 4| 2| 394|
| 11| 3| 657|
| 13| 3| 257|
+-------+-----------+------+
df.filter(col('amount') > 500).select(countDistinct(col('customer_id')).alias('rich_count')).show()
+----------+
|rich_count|
+----------+
| 2|
+----------+
Project Employees I
https://leetcode.com/problems/project-employees-i/description/
project_schema = StructType([StructField('project_id', IntegerType(), False),
StructField('employee_id', IntegerType(), True)])
project_df = spark.createDataFrame([[1, 1],
[1, 2],
[1, 3],
[2, 1],
[2, 4]], schema=project_schema)
employee_schema = StructType([StructField('employee_id', IntegerType(), False),
StructField('name', StringType(), False),
StructField('experience_years', IntegerType(), True)])
employee_df = spark.createDataFrame([[1, 'Khaled', 3],
[2, 'Ali', 2],
[3, 'John', 1],
[4, 'Doe', 2]], schema=employee_schema)
project_df.show()
+----------+-----------+
|project_id|employee_id|
+----------+-----------+
| 1| 1|
| 1| 2|
| 1| 3|
| 2| 1|
| 2| 4|
+----------+-----------+
employee_df.show()
+-----------+------+----------------+
|employee_id| name|experience_years|
+-----------+------+----------------+
| 1|Khaled| 3|
| 2| Ali| 2|
| 3| John| 1|
| 4| Doe| 2|
+-----------+------+----------------+
(project_df.join(employee_df, on='employee_id', how='inner')
    .groupBy('project_id').avg('experience_years')
    .select(col('project_id'), round(col('avg(experience_years)'), 2).alias('average_years'))
    .show())
+----------+-------------+
|project_id|average_years|
+----------+-------------+
| 1| 2.0|
| 2| 2.5|
+----------+-------------+
Customer Who Visited but Did Not Make Any Transactions
https://leetcode.com/problems/customer-who-visited-but-did-not-make-any-transactions/description/?envType=study-plan-v2&envId=top-sql-50
visits_schema = StructType([StructField('visit_id', IntegerType(), False),
StructField('customer_id', IntegerType(), False)])
visits_df = spark.createDataFrame([[1, 23],
[2, 9],
[4, 30],
[5, 54],
[6, 96],
[7, 54],
[8, 54]], schema=visits_schema)
transactions_schema = StructType([StructField('transaction_id', IntegerType(), False),
StructField('visit_id', IntegerType(), False),
StructField('amount', IntegerType(), False)])
transactions_df = spark.createDataFrame([[2, 5, 310],
[3, 5, 300],
[9, 5, 200],
[12, 1, 910],
[13, 2, 970]], schema=transactions_schema)
visits_df.show()
+--------+-----------+
|visit_id|customer_id|
+--------+-----------+
| 1| 23|
| 2| 9|
| 4| 30|
| 5| 54|
| 6| 96|
| 7| 54|
| 8| 54|
+--------+-----------+
transactions_df.show()
+--------------+--------+------+
|transaction_id|visit_id|amount|
+--------------+--------+------+
| 2| 5| 310|
| 3| 5| 300|
| 9| 5| 200|
| 12| 1| 910|
| 13| 2| 970|
+--------------+--------+------+
If transactions_df is small, we can collect its distinct visit_ids to the driver and filter against them -
unique_visit_ids_who_transacted = [i[0] for i in transactions_df.select('visit_id').distinct().collect()]
(visits_df.filter(~visits_df['visit_id'].isin(unique_visit_ids_who_transacted))
    .groupBy('customer_id').count()
    .withColumnRenamed('count', 'count_no_trans')
    .show())
+-----------+--------------+
|customer_id|count_no_trans|
+-----------+--------------+
| 30| 1|
| 96| 1|
| 54| 2|
+-----------+--------------+
If transactions_df is too large to collect(), we can use a left join and pick the records with a null transaction_id
(visits_df.join(transactions_df, on='visit_id', how='left')
    .filter(col('transaction_id').isNull())
    .groupBy('customer_id').agg(count('visit_id'))
    .withColumnRenamed('count(visit_id)', 'count_no_trans')
    .show())
+-----------+--------------+
|customer_id|count_no_trans|
+-----------+--------------+
| 54| 2|
| 96| 1|
| 30| 1|
+-----------+--------------+
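A third variant, sketched with a left anti join, which keeps only the visits that have no matching transaction and avoids the explicit null filter:
# Sketch: anti join drops every visit that appears in transactions_df
(visits_df.join(transactions_df, on='visit_id', how='left_anti')
    .groupBy('customer_id')
    .agg(count('visit_id').alias('count_no_trans'))
    .show())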
Product Price at a Given Date
https://leetcode.com/problems/product-price-at-a-given-date/description/
product_schema = StructType([StructField('product_id', IntegerType(), False),
StructField('new_price', IntegerType(), True),
StructField('change_date', StringType(), True)])
product_df = spark.createDataFrame([[1, 20, '2019-08-14'],
[2, 50, '2019-08-14'],
[1, 30, '2019-08-15'],
[1, 35, '2019-08-16'],
[2, 65, '2019-08-17'],
[3, 20, '2019-08-18']], schema=product_schema).select('product_id', 'new_price', col('change_date').cast('date'))
product_df.show()
+----------+---------+-----------+
|product_id|new_price|change_date|
+----------+---------+-----------+
| 1| 20| 2019-08-14|
| 2| 50| 2019-08-14|
| 1| 30| 2019-08-15|
| 1| 35| 2019-08-16|
| 2| 65| 2019-08-17|
| 3| 20| 2019-08-18|
+----------+---------+-----------+
from pyspark.sql.functions import min as min_
## note - import PySpark's min under an alias: without the import, min() would resolve to Python's built-in and break the aggregation below, and importing it un-aliased would shadow the built-in min()
(product_df.filter(col('change_date') <= '2019-08-16')
    .select('product_id',
            first('new_price').over(Window.partitionBy('product_id')
                                    .orderBy(col('change_date').desc())).alias('price'))
    .union(product_df.groupBy('product_id').agg(min_('change_date').alias('change_date'))
           .filter(col('change_date') > '2019-08-16')
           .withColumn('price', lit(10))
           .select('product_id', 'price'))
    .dropDuplicates().show())
+----------+-----+
|product_id|price|
+----------+-----+
| 2| 50|
| 1| 35|
| 3| 10|
+----------+-----+
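A sketch of an equivalent join-based approach: take each product's latest change on or before the target date, attach its price, and default to 10 (the problem's starting price) for products whose first change comes later. max and coalesce are imported here since they are not in the top import list, and max is aliased for the same reason as min above; latest and latest_price are illustrative names.
from pyspark.sql.functions import max as max_, coalesce
# Sketch: latest qualifying change date per product
latest = (product_df.filter(col('change_date') <= '2019-08-16')
          .groupBy('product_id').agg(max_('change_date').alias('change_date')))
# Attach the price that was set on that date
latest_price = latest.join(product_df, on=['product_id', 'change_date'], how='inner')
# Every product appears; those without a qualifying change fall back to price 10
(product_df.select('product_id').distinct()
    .join(latest_price, on='product_id', how='left')
    .select('product_id', coalesce(col('new_price'), lit(10)).alias('price'))
    .show())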
In [ ]:

More Related Content

Similar to LeetCode Database problems solved using PySpark.pdf

Online Movie Ticket Booking
Online Movie Ticket BookingOnline Movie Ticket Booking
Online Movie Ticket Booking
Astha Patel
 
Scaling Up: How Switching to Apache Spark Improved Performance, Realizability...
Scaling Up: How Switching to Apache Spark Improved Performance, Realizability...Scaling Up: How Switching to Apache Spark Improved Performance, Realizability...
Scaling Up: How Switching to Apache Spark Improved Performance, Realizability...
Databricks
 
Scaling up data science applications
Scaling up data science applicationsScaling up data science applications
Scaling up data science applications
Kexin Xie
 
Informatics Practices (new) solution CBSE 2021, Compartment, improvement ex...
Informatics Practices (new) solution CBSE  2021, Compartment,  improvement ex...Informatics Practices (new) solution CBSE  2021, Compartment,  improvement ex...
Informatics Practices (new) solution CBSE 2021, Compartment, improvement ex...
FarhanAhmade
 
JAVA.pdf
JAVA.pdfJAVA.pdf
JAVA.pdf
jyotir7777
 
Introduction to R
Introduction to RIntroduction to R
Introduction to R
Sander Kieft
 
Need an detailed analysis of what this code-model is doing- Thanks #St.pdf
Need an detailed analysis of what this code-model is doing- Thanks #St.pdfNeed an detailed analysis of what this code-model is doing- Thanks #St.pdf
Need an detailed analysis of what this code-model is doing- Thanks #St.pdf
actexerode
 
Geometric and material nonlinearity analysis of 2 d truss with force and duct...
Geometric and material nonlinearity analysis of 2 d truss with force and duct...Geometric and material nonlinearity analysis of 2 d truss with force and duct...
Geometric and material nonlinearity analysis of 2 d truss with force and duct...
Salar Delavar Qashqai
 
Data Structure in C (Lab Programs)
Data Structure in C (Lab Programs)Data Structure in C (Lab Programs)
Data Structure in C (Lab Programs)
Saket Pathak
 
RDataMining slides-r-programming
RDataMining slides-r-programmingRDataMining slides-r-programming
RDataMining slides-r-programming
Yanchang Zhao
 
R (Shiny Package) - Server Side Code for Decision Support System
R (Shiny Package) - Server Side Code for Decision Support SystemR (Shiny Package) - Server Side Code for Decision Support System
R (Shiny Package) - Server Side Code for Decision Support System
Maithreya Chakravarthula
 
Protractor framework – how to make stable e2e tests for Angular applications
Protractor framework – how to make stable e2e tests for Angular applicationsProtractor framework – how to make stable e2e tests for Angular applications
Protractor framework – how to make stable e2e tests for Angular applications
Ludmila Nesvitiy
 
Time Series Analysis and Mining with R
Time Series Analysis and Mining with RTime Series Analysis and Mining with R
Time Series Analysis and Mining with R
Yanchang Zhao
 
20.1 Java working with abstraction
20.1 Java working with abstraction20.1 Java working with abstraction
20.1 Java working with abstraction
Intro C# Book
 
All I know about rsc.io/c2go
All I know about rsc.io/c2goAll I know about rsc.io/c2go
All I know about rsc.io/c2go
Moriyoshi Koizumi
 
LSFMM 2019 BPF Observability
LSFMM 2019 BPF ObservabilityLSFMM 2019 BPF Observability
LSFMM 2019 BPF Observability
Brendan Gregg
 
Zoro123456789123456789123456789123456789
Zoro123456789123456789123456789123456789Zoro123456789123456789123456789123456789
Zoro123456789123456789123456789123456789
Ghh
 
labb123456789123456789123456789123456789
labb123456789123456789123456789123456789labb123456789123456789123456789123456789
labb123456789123456789123456789123456789
Ghh
 
Addressing Scenario
Addressing ScenarioAddressing Scenario
Addressing Scenario
Tara Hardin
 
Powering Heap With PostgreSQL And CitusDB (PGConf Silicon Valley 2015)
Powering Heap With PostgreSQL And CitusDB (PGConf Silicon Valley 2015)Powering Heap With PostgreSQL And CitusDB (PGConf Silicon Valley 2015)
Powering Heap With PostgreSQL And CitusDB (PGConf Silicon Valley 2015)
Dan Robinson
 

Similar to LeetCode Database problems solved using PySpark.pdf (20)

Online Movie Ticket Booking
Online Movie Ticket BookingOnline Movie Ticket Booking
Online Movie Ticket Booking
 
Scaling Up: How Switching to Apache Spark Improved Performance, Realizability...
Scaling Up: How Switching to Apache Spark Improved Performance, Realizability...Scaling Up: How Switching to Apache Spark Improved Performance, Realizability...
Scaling Up: How Switching to Apache Spark Improved Performance, Realizability...
 
Scaling up data science applications
Scaling up data science applicationsScaling up data science applications
Scaling up data science applications
 
Informatics Practices (new) solution CBSE 2021, Compartment, improvement ex...
Informatics Practices (new) solution CBSE  2021, Compartment,  improvement ex...Informatics Practices (new) solution CBSE  2021, Compartment,  improvement ex...
Informatics Practices (new) solution CBSE 2021, Compartment, improvement ex...
 
JAVA.pdf
JAVA.pdfJAVA.pdf
JAVA.pdf
 
Introduction to R
Introduction to RIntroduction to R
Introduction to R
 
Need an detailed analysis of what this code-model is doing- Thanks #St.pdf
Need an detailed analysis of what this code-model is doing- Thanks #St.pdfNeed an detailed analysis of what this code-model is doing- Thanks #St.pdf
Need an detailed analysis of what this code-model is doing- Thanks #St.pdf
 
Geometric and material nonlinearity analysis of 2 d truss with force and duct...
Geometric and material nonlinearity analysis of 2 d truss with force and duct...Geometric and material nonlinearity analysis of 2 d truss with force and duct...
Geometric and material nonlinearity analysis of 2 d truss with force and duct...
 
Data Structure in C (Lab Programs)
Data Structure in C (Lab Programs)Data Structure in C (Lab Programs)
Data Structure in C (Lab Programs)
 
RDataMining slides-r-programming
RDataMining slides-r-programmingRDataMining slides-r-programming
RDataMining slides-r-programming
 
R (Shiny Package) - Server Side Code for Decision Support System
R (Shiny Package) - Server Side Code for Decision Support SystemR (Shiny Package) - Server Side Code for Decision Support System
R (Shiny Package) - Server Side Code for Decision Support System
 
Protractor framework – how to make stable e2e tests for Angular applications
Protractor framework – how to make stable e2e tests for Angular applicationsProtractor framework – how to make stable e2e tests for Angular applications
Protractor framework – how to make stable e2e tests for Angular applications
 
Time Series Analysis and Mining with R
Time Series Analysis and Mining with RTime Series Analysis and Mining with R
Time Series Analysis and Mining with R
 
20.1 Java working with abstraction
20.1 Java working with abstraction20.1 Java working with abstraction
20.1 Java working with abstraction
 
All I know about rsc.io/c2go
All I know about rsc.io/c2goAll I know about rsc.io/c2go
All I know about rsc.io/c2go
 
LSFMM 2019 BPF Observability
LSFMM 2019 BPF ObservabilityLSFMM 2019 BPF Observability
LSFMM 2019 BPF Observability
 
Zoro123456789123456789123456789123456789
Zoro123456789123456789123456789123456789Zoro123456789123456789123456789123456789
Zoro123456789123456789123456789123456789
 
labb123456789123456789123456789123456789
labb123456789123456789123456789123456789labb123456789123456789123456789123456789
labb123456789123456789123456789123456789
 
Addressing Scenario
Addressing ScenarioAddressing Scenario
Addressing Scenario
 
Powering Heap With PostgreSQL And CitusDB (PGConf Silicon Valley 2015)
Powering Heap With PostgreSQL And CitusDB (PGConf Silicon Valley 2015)Powering Heap With PostgreSQL And CitusDB (PGConf Silicon Valley 2015)
Powering Heap With PostgreSQL And CitusDB (PGConf Silicon Valley 2015)
 

Recently uploaded

Sea Wave Energy - Renewable Energy Resources
Sea Wave Energy - Renewable Energy ResourcesSea Wave Energy - Renewable Energy Resources
Sea Wave Energy - Renewable Energy Resources
21h16charis
 
Defect Elimination Management - CMMS Success.pdf
Defect Elimination Management - CMMS Success.pdfDefect Elimination Management - CMMS Success.pdf
Defect Elimination Management - CMMS Success.pdf
David Johnston
 
Design and Engineering Module 1 power point
Design and Engineering Module 1 power pointDesign and Engineering Module 1 power point
Design and Engineering Module 1 power point
ssuser76af31
 
Database management system module -3 bcs403
Database management system module -3 bcs403Database management system module -3 bcs403
Database management system module -3 bcs403
Tharani4825
 
HSE-BMS-009 COSHH & MSDS.pptHSE-BMS-009 COSHH & MSDS.pptSE-BMS-009 COSHH & MSDS.
HSE-BMS-009 COSHH & MSDS.pptHSE-BMS-009 COSHH & MSDS.pptSE-BMS-009 COSHH & MSDS.HSE-BMS-009 COSHH & MSDS.pptHSE-BMS-009 COSHH & MSDS.pptSE-BMS-009 COSHH & MSDS.
HSE-BMS-009 COSHH & MSDS.pptHSE-BMS-009 COSHH & MSDS.pptSE-BMS-009 COSHH & MSDS.
NoeAranel
 
Mobile Forensics challenges and Extraction process
Mobile Forensics challenges and Extraction processMobile Forensics challenges and Extraction process
Mobile Forensics challenges and Extraction process
Swapnil Gharat
 
VEMC: Trusted Leader in Engineering Solutions
VEMC: Trusted Leader in Engineering SolutionsVEMC: Trusted Leader in Engineering Solutions
VEMC: Trusted Leader in Engineering Solutions
VijayEngineeringandM1
 
Modified O-RAN 5G Edge Reference Architecture using RNN
Modified O-RAN 5G Edge Reference Architecture using RNNModified O-RAN 5G Edge Reference Architecture using RNN
Modified O-RAN 5G Edge Reference Architecture using RNN
ijwmn
 
RAILWAYS, a vital part of our infrastructure, play a crucial role in ensuring...
RAILWAYS, a vital part of our infrastructure, play a crucial role in ensuring...RAILWAYS, a vital part of our infrastructure, play a crucial role in ensuring...
RAILWAYS, a vital part of our infrastructure, play a crucial role in ensuring...
Kiran Kumar Manigam
 
FIRE OUTBREAK EMERGENCY FINAL REPORT.pdf
FIRE OUTBREAK EMERGENCY FINAL REPORT.pdfFIRE OUTBREAK EMERGENCY FINAL REPORT.pdf
FIRE OUTBREAK EMERGENCY FINAL REPORT.pdf
Dar es Salaam, Tanzania
 
Comerica Inc Annual report summary and financial updates
Comerica Inc Annual report summary and financial updatesComerica Inc Annual report summary and financial updates
Comerica Inc Annual report summary and financial updates
ssuserb8b8c7
 
The Transformation Risk-Benefit Model of Artificial Intelligence: Balancing R...
The Transformation Risk-Benefit Model of Artificial Intelligence: Balancing R...The Transformation Risk-Benefit Model of Artificial Intelligence: Balancing R...
The Transformation Risk-Benefit Model of Artificial Intelligence: Balancing R...
gerogepatton
 
Future Networking v Energy Limits ICTON 2024 Bari Italy
Future Networking v Energy Limits ICTON 2024 Bari ItalyFuture Networking v Energy Limits ICTON 2024 Bari Italy
Future Networking v Energy Limits ICTON 2024 Bari Italy
University of Hertfordshire
 
Updated Limitations of Simplified Methods for Evaluating the Potential for Li...
Updated Limitations of Simplified Methods for Evaluating the Potential for Li...Updated Limitations of Simplified Methods for Evaluating the Potential for Li...
Updated Limitations of Simplified Methods for Evaluating the Potential for Li...
Robert Pyke
 
Protect YugabyteDB with Hashicorp Vault.pdf
Protect YugabyteDB with Hashicorp Vault.pdfProtect YugabyteDB with Hashicorp Vault.pdf
Protect YugabyteDB with Hashicorp Vault.pdf
Gwenn Etourneau
 
CATIA V5 Automation VB script.........pdf
CATIA V5 Automation VB script.........pdfCATIA V5 Automation VB script.........pdf
CATIA V5 Automation VB script.........pdf
shahidad729
 
Updated Limitations of Simplified Methods for Evaluating the Potential for Li...
Updated Limitations of Simplified Methods for Evaluating the Potential for Li...Updated Limitations of Simplified Methods for Evaluating the Potential for Li...
Updated Limitations of Simplified Methods for Evaluating the Potential for Li...
Robert Pyke
 
Structural Dynamics and Earthquake Engineering
Structural Dynamics and Earthquake EngineeringStructural Dynamics and Earthquake Engineering
Structural Dynamics and Earthquake Engineering
tushardatta
 
R18B.Tech.OpenElectivesWEF2021_22AdmittedBatch1.pdf
R18B.Tech.OpenElectivesWEF2021_22AdmittedBatch1.pdfR18B.Tech.OpenElectivesWEF2021_22AdmittedBatch1.pdf
R18B.Tech.OpenElectivesWEF2021_22AdmittedBatch1.pdf
bibej11828
 
Artificial Intelligence Imaging - medical imaging
Artificial Intelligence Imaging - medical imagingArtificial Intelligence Imaging - medical imaging
Artificial Intelligence Imaging - medical imaging
NeeluPari
 

Recently uploaded (20)

Sea Wave Energy - Renewable Energy Resources
Sea Wave Energy - Renewable Energy ResourcesSea Wave Energy - Renewable Energy Resources
Sea Wave Energy - Renewable Energy Resources
 
Defect Elimination Management - CMMS Success.pdf
Defect Elimination Management - CMMS Success.pdfDefect Elimination Management - CMMS Success.pdf
Defect Elimination Management - CMMS Success.pdf
 
Design and Engineering Module 1 power point
Design and Engineering Module 1 power pointDesign and Engineering Module 1 power point
Design and Engineering Module 1 power point
 
Database management system module -3 bcs403
Database management system module -3 bcs403Database management system module -3 bcs403
Database management system module -3 bcs403
 
HSE-BMS-009 COSHH & MSDS.pptHSE-BMS-009 COSHH & MSDS.pptSE-BMS-009 COSHH & MSDS.
HSE-BMS-009 COSHH & MSDS.pptHSE-BMS-009 COSHH & MSDS.pptSE-BMS-009 COSHH & MSDS.HSE-BMS-009 COSHH & MSDS.pptHSE-BMS-009 COSHH & MSDS.pptSE-BMS-009 COSHH & MSDS.
HSE-BMS-009 COSHH & MSDS.pptHSE-BMS-009 COSHH & MSDS.pptSE-BMS-009 COSHH & MSDS.
 
Mobile Forensics challenges and Extraction process
Mobile Forensics challenges and Extraction processMobile Forensics challenges and Extraction process
Mobile Forensics challenges and Extraction process
 
VEMC: Trusted Leader in Engineering Solutions
VEMC: Trusted Leader in Engineering SolutionsVEMC: Trusted Leader in Engineering Solutions
VEMC: Trusted Leader in Engineering Solutions
 
Modified O-RAN 5G Edge Reference Architecture using RNN
Modified O-RAN 5G Edge Reference Architecture using RNNModified O-RAN 5G Edge Reference Architecture using RNN
Modified O-RAN 5G Edge Reference Architecture using RNN
 
RAILWAYS, a vital part of our infrastructure, play a crucial role in ensuring...
RAILWAYS, a vital part of our infrastructure, play a crucial role in ensuring...RAILWAYS, a vital part of our infrastructure, play a crucial role in ensuring...
RAILWAYS, a vital part of our infrastructure, play a crucial role in ensuring...
 
FIRE OUTBREAK EMERGENCY FINAL REPORT.pdf
FIRE OUTBREAK EMERGENCY FINAL REPORT.pdfFIRE OUTBREAK EMERGENCY FINAL REPORT.pdf
FIRE OUTBREAK EMERGENCY FINAL REPORT.pdf
 
Comerica Inc Annual report summary and financial updates
Comerica Inc Annual report summary and financial updatesComerica Inc Annual report summary and financial updates
Comerica Inc Annual report summary and financial updates
 
The Transformation Risk-Benefit Model of Artificial Intelligence: Balancing R...
The Transformation Risk-Benefit Model of Artificial Intelligence: Balancing R...The Transformation Risk-Benefit Model of Artificial Intelligence: Balancing R...
The Transformation Risk-Benefit Model of Artificial Intelligence: Balancing R...
 
Future Networking v Energy Limits ICTON 2024 Bari Italy
Future Networking v Energy Limits ICTON 2024 Bari ItalyFuture Networking v Energy Limits ICTON 2024 Bari Italy
Future Networking v Energy Limits ICTON 2024 Bari Italy
 
Updated Limitations of Simplified Methods for Evaluating the Potential for Li...
Updated Limitations of Simplified Methods for Evaluating the Potential for Li...Updated Limitations of Simplified Methods for Evaluating the Potential for Li...
Updated Limitations of Simplified Methods for Evaluating the Potential for Li...
 
Protect YugabyteDB with Hashicorp Vault.pdf
Protect YugabyteDB with Hashicorp Vault.pdfProtect YugabyteDB with Hashicorp Vault.pdf
Protect YugabyteDB with Hashicorp Vault.pdf
 
CATIA V5 Automation VB script.........pdf
CATIA V5 Automation VB script.........pdfCATIA V5 Automation VB script.........pdf
CATIA V5 Automation VB script.........pdf
 
Updated Limitations of Simplified Methods for Evaluating the Potential for Li...
Updated Limitations of Simplified Methods for Evaluating the Potential for Li...Updated Limitations of Simplified Methods for Evaluating the Potential for Li...
Updated Limitations of Simplified Methods for Evaluating the Potential for Li...
 
Structural Dynamics and Earthquake Engineering
Structural Dynamics and Earthquake EngineeringStructural Dynamics and Earthquake Engineering
Structural Dynamics and Earthquake Engineering
 
R18B.Tech.OpenElectivesWEF2021_22AdmittedBatch1.pdf
R18B.Tech.OpenElectivesWEF2021_22AdmittedBatch1.pdfR18B.Tech.OpenElectivesWEF2021_22AdmittedBatch1.pdf
R18B.Tech.OpenElectivesWEF2021_22AdmittedBatch1.pdf
 
Artificial Intelligence Imaging - medical imaging
Artificial Intelligence Imaging - medical imagingArtificial Intelligence Imaging - medical imaging
Artificial Intelligence Imaging - medical imaging
 

LeetCode Database problems solved using PySpark.pdf

  • 1. from pyspark.sql import SparkSession, Window from pyspark.sql.types import StructType, StructField, IntegerType, StringType, BooleanType, TimestampType, DateType from pyspark.sql.functions import sum, col, count, when, lead, collect_list, length, year, countDistinct, round, first, lit from datetime import datetime spark = SparkSession.builder.master('local[*]').appName('PysparkDataPipeline').getOrCreate() Creating an empty Df - schema = StructType([StructField('column_a', IntegerType(), True), StructField('column_b', StringType(), False)]) df = spark.createDataFrame([], schema=schema) df.show() +--------+--------+ |column_a|column_b| +--------+--------+ +--------+--------+ Append Data into it df2 = spark.createDataFrame([[1, 'a'], [2, 'b'], [3, 'c'], [None, 'd']], schema=schema) df2.show() +--------+--------+ |column_a|column_b| +--------+--------+ | 1| a| | 2| b| | 3| c| | null| d| +--------+--------+ df = df.union(df2) df.show() In [ ]: In [ ]: In [ ]: In [ ]: In [ ]: In [ ]: In [ ]: In [ ]: In [ ]:
  • 2. +--------+--------+ |column_a|column_b| +--------+--------+ | 1| a| | 2| b| | 3| c| | null| d| +--------+--------+ Fill Null Values df = df.fillna(1, subset=['column_a']) df.show() +--------+--------+ |column_a|column_b| +--------+--------+ | 1| a| | 2| b| | 3| c| | 1| d| +--------+--------+ Replace all 1's with 2's from pyspark.sql.functions import when df = df.replace(1, 2, subset=['column_a']) df.show() +--------+--------+ |column_a|column_b| +--------+--------+ | 2| a| | 2| b| | 3| c| | 2| d| +--------+--------+ LeetCode SQL problem Replace Employee ID With The Unique Identifier https://leetcode.com/problems/replace-employee-id-with-the-unique-identifier/description/?envType=study-plan-v2&envId=top-sql-50 In [ ]: In [ ]: In [ ]:
  • 3. from pyspark.sql.types import StructType, StructField, IntegerType, StringType employees_schema = StructType([StructField('id', IntegerType(), False), StructField('name', StringType(), True)]) employees_df = spark.createDataFrame([[1, 'Alice'], [7, 'Bob'], [11, 'Meir'], [90, 'Winston'], [3, 'Jonathan']], schema=employees_schema) employee_uni_schema = StructType([StructField('id', IntegerType(), False), StructField('unique_id', IntegerType(), True)]) employee_uni_df = spark.createDataFrame([[3, 1], [11, 2], [90, 3]], schema=employee_uni_schema) employees_df.show() +---+--------+ | id| name| +---+--------+ | 1| Alice| | 7| Bob| | 11| Meir| | 90| Winston| | 3|Jonathan| +---+--------+ employee_uni_df.show() +---+---------+ | id|unique_id| +---+---------+ | 3| 1| | 11| 2| | 90| 3| +---+---------+ employees_df.join(employee_uni_df, how='left', on=['id']).select('unique_id', 'name').show() +---------+--------+ |unique_id| name| +---------+--------+ | null| Alice| | null| Bob| | 2| Meir| | 1|Jonathan| | 3| Winston| +---------+--------+ In [ ]: In [ ]: In [ ]: In [ ]:
  • 4. Leetcode SQL problem - Confirmation Rate https://leetcode.com/problems/confirmation-rate/description/?envType=study-plan-v2&envId=top-sql-50 signups_schema = StructType([StructField('user_id', IntegerType(), False), StructField('time_stamp', TimestampType(), True)]) confirmations_schema = StructType([StructField('user_id', IntegerType(), True), StructField('time_stamp', TimestampType(), True), StructField('action', StringType(), True)]) sinups_df = spark.createDataFrame([[3, datetime.strptime('2020-03-21 10:16:13', '%Y-%m-%d %H:%M:%S')], [7, datetime.strptime('2020-01-04 13:57:59', '%Y-%m-%d %H:%M:%S')], [2, datetime.strptime('2020-07-29 23:09:44', '%Y-%m-%d %H:%M:%S')], [6, datetime.strptime('2020-12-09 10:39:37', '%Y-%m-%d %H:%M:%S')]], schema=signups_schema) confirmations_df = spark.createDataFrame([[3, datetime.strptime('2021-01-06 03:30:46', '%Y-%m-%d %H:%M:%S'), 'timeout'], [3, datetime.strptime('2021-07-14 14:00:00', '%Y-%m-%d %H:%M:%S'), 'timeout'], [7, datetime.strptime('2021-06-12 11:57:29', '%Y-%m-%d %H:%M:%S'), 'confirmed'], [7, datetime.strptime('2021-06-13 12:58:28', '%Y-%m-%d %H:%M:%S'), 'confirmed'], [7, datetime.strptime('2021-06-14 13:59:27', '%Y-%m-%d %H:%M:%S'), 'confirmed'], [2, datetime.strptime('2021-01-22 00:00:00', '%Y-%m-%d %H:%M:%S'), 'confirmed'], [2, datetime.strptime('2021-02-28 23:59:59', '%Y-%m-%d %H:%M:%S'), 'timeout']], schema=confirmations_schema) result_df = sinups_df.join(confirmations_df, how='left', on=['user_id']) result_df.groupBy('user_id').agg((sum(when(result_df.action == 'confirmed', 1).otherwise( 0.00)) / count('*')).alias('confirmation_rate')).show() +-------+-----------------+ |user_id|confirmation_rate| +-------+-----------------+ | 3| 0.0| | 7| 1.0| | 2| 0.5| | 6| 0.0| +-------+-----------------+ Find Users With Valid E-Mails https://leetcode.com/problems/find-users-with-valid-e-mails/description/?envType=study-plan-v2&envId=30-days-of-pandas&lang=pythondata schema = StructType([StructField('user_id', IntegerType(), False), StructField('name', StringType(), True), StructField('mail', StringType(), True)]) df = spark.createDataFrame([[1, 'Winston', 'winston@leetcode.com'], [2, 'Jonathan', 'jonathanisgreat'], [3, 'Annabelle', 'bella-@leetcode.com'], [4, 'Sally', 'sally.come@leetcode.com'], [5, 'Marwan', 'quarz#2020@leetcode.com'], In [ ]: In [ ]: In [ ]: In [ ]: In [ ]:
  • 5. [6, 'David', 'david69@gmail.com'], [7, 'Shapiro', '.shapo@leetcode.com']], schema=schema) df.show() +-------+---------+--------------------+ |user_id| name| mail| +-------+---------+--------------------+ | 1| Winston|winston@leetcode.com| | 2| Jonathan| jonathanisgreat| | 3|Annabelle| bella-@leetcode.com| | 4| Sally|sally.come@leetco...| | 5| Marwan|quarz#2020@leetco...| | 6| David| david69@gmail.com| | 7| Shapiro| .shapo@leetcode.com| +-------+---------+--------------------+ df.filter(col('mail').rlike('^[a-zA-Z][a-zA-Z0-9_.-]*@leetcode.com')).show() +-------+---------+--------------------+ |user_id| name| mail| +-------+---------+--------------------+ | 1| Winston|winston@leetcode.com| | 3|Annabelle| bella-@leetcode.com| | 4| Sally|sally.come@leetco...| +-------+---------+--------------------+ Consecutive Numbers https://leetcode.com/problems/consecutive-numbers/ schema = StructType([StructField('id', IntegerType(), False), StructField('num', StringType(), True)]) df = spark.createDataFrame([[1, '1'], [2, '1'], [3, '1'], [4, '2'], [5, '1'], [6, '2'], [7, '2']], schema=schema) df.createOrReplaceTempView('df') spark.sql('''with temp as ( select num, lead(num, 1) over (order by id) num1, lead(num, 2) over (order by id) num2 from df In [ ]: In [ ]: In [ ]: In [ ]: In [ ]:
  • 6. ) select distinct num from temp where num = num1 and num = num2''').show() +---+ |num| +---+ | 1| +---+ 23/08/03 15:05:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradatio n. 23/08/03 15:05:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradatio n. 23/08/03 15:05:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradatio n. 23/08/03 15:05:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradatio n. df.withColumn('num1', lead('num', 1).over(Window.orderBy('id'))) .withColumn('num2', lead('num', 2).over(Window.orderBy('id'))) .filter((col('num') == col('num1')) & (col('num') == col('num2'))) .select('id').distinct().show() +---+ | id| +---+ | 1| +---+ 23/08/03 15:56:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradatio n. 23/08/03 15:56:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradatio n. 23/08/03 15:56:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradatio n. 23/08/03 15:56:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradatio n. Students and classes example schema = StructType([StructField('class', IntegerType(), True), StructField('student_id', IntegerType(), True), StructField('term', IntegerType(), True), StructField('subject', StringType(), True), StructField('marks', IntegerType(), True)]) df = spark.createDataFrame([[2, 1, 1, 'maths', 10], In [ ]: In [ ]:
  • 7. [2, 1, 2, 'maths', 12], [2, 1, 1, 'english', 14], [2, 1, 2, 'english', 12], [3, 2, 1, 'maths', 10], [3, 2, 2, 'maths', 12], [3, 2, 1, 'english', 16], [3, 2, 2, 'english', 14]], schema=schema) df.show() +-----+----------+----+-------+-----+ |class|student_id|term|subject|marks| +-----+----------+----+-------+-----+ | 2| 1| 1| maths| 10| | 2| 1| 2| maths| 12| | 2| 1| 1|english| 14| | 2| 1| 2|english| 12| | 3| 2| 1| maths| 10| | 3| 2| 2| maths| 12| | 3| 2| 1|english| 16| | 3| 2| 2|english| 14| +-----+----------+----+-------+-----+ Get class 2 students in following format --> class student_id subject term1 term2 df = df.orderBy('class', 'student_id', 'subject') .groupBy('class', 'student_id', 'subject') .agg(collect_list('marks')[0].alias('term1'), collect_list('marks')[1].alias('term2')) df.show() +-----+----------+-------+-----+-----+ |class|student_id|subject|term1|term2| +-----+----------+-------+-----+-----+ | 2| 1|english| 14| 12| | 2| 1| maths| 10| 12| | 3| 2|english| 16| 14| | 3| 2| maths| 10| 12| +-----+----------+-------+-----+-----+ df.filter(col('class') == 2).show() In [ ]: In [ ]: In [ ]: In [ ]:
  • 8. +-----+----------+-------+-----+-----+ |class|student_id|subject|term1|term2| +-----+----------+-------+-----+-----+ | 2| 1|english| 14| 12| | 2| 1| maths| 10| 12| +-----+----------+-------+-----+-----+ Get subject-wise aggregated score with 25% weightage to term1 and 75% weightage to term2 df = spark.createDataFrame([[2, 1, 1, 'maths', 10], [2, 1, 2, 'maths', 12], [2, 1, 1, 'english', 14], [2, 1, 2, 'english', 12], [3, 2, 1, 'maths', 10], [3, 2, 2, 'maths', 12], [3, 2, 1, 'english', 16], [3, 2, 2, 'english', 14]], schema=schema) df.show() +-----+----------+----+-------+-----+ |class|student_id|term|subject|marks| +-----+----------+----+-------+-----+ | 2| 1| 1| maths| 10| | 2| 1| 2| maths| 12| | 2| 1| 1|english| 14| | 2| 1| 2|english| 12| | 3| 2| 1| maths| 10| | 3| 2| 2| maths| 12| | 3| 2| 1|english| 16| | 3| 2| 2|english| 14| +-----+----------+----+-------+-----+ maths_agg = df.filter(col('subject') == 'maths') .orderBy('student_id', 'term') .groupBy('student_id') .agg((collect_list('marks')[0] * 0.25 + collect_list('marks')[1] * 0.75).alias('maths_agg')) english_agg = df.filter(col('subject') == 'english') .orderBy('student_id', 'term') .groupBy('student_id') .agg((collect_list('marks')[0] * 0.25 + collect_list('marks')[1] * 0.75).alias('english_agg')) maths_agg.join(english_agg, how='inner', on='student_id').show() In [ ]: In [ ]: In [ ]: In [ ]: In [ ]:
  • 9. +----------+---------+-----------+ |student_id|maths_agg|english_agg| +----------+---------+-----------+ | 1| 11.5| 12.5| | 2| 11.5| 14.5| +----------+---------+-----------+ Exchange Seats - https://leetcode.com/problems/exchange-seats/description/ import pandas as pd import io data = ''' id,student 1,Abbot 2,Doris 3,Emerson 4,Green 5,Jeames ''' df = spark.createDataFrame(pd.read_csv(io.StringIO(data), header=0)) df.show() +---+-------+ | id|student| +---+-------+ | 1| Abbot| | 2| Doris| | 3|Emerson| | 4| Green| | 5| Jeames| +---+-------+ max_id = max(df.select('id').collect())[0] df.select(when(col('id') % 2 == 1, lead('id', 1, max_id).over(Window.orderBy('id'))) .otherwise(col('id') - 1).alias('id'), 'Student').orderBy('id').show() In [ ]: In [ ]: In [ ]:
  • 10. +---+-------+ | id|Student| +---+-------+ | 1| Doris| | 2| Abbot| | 3| Green| | 4|Emerson| | 5| Jeames| +---+-------+ 23/08/09 17:41:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradatio n. 23/08/09 17:41:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradatio n. 23/08/09 17:41:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradatio n. 23/08/09 17:41:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradatio n. 23/08/09 17:41:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradatio n. Tree Node https://leetcode.com/problems/tree-node/ schema = StructType([StructField('id', IntegerType(), False), StructField('p_id', IntegerType(), True)]) df = spark.createDataFrame( [[1, None], [2, 1], [3, 1], [4, 2], [5, 2]], schema=schema) df.show() +---+----+ | id|p_id| +---+----+ | 1|null| | 2| 1| | 3| 1| | 4| 2| | 5| 2| +---+----+ distinct_parent_ids = df.select( 'p_id').distinct().rdd.flatMap(lambda x: x).collect() In [ ]: In [ ]: In [ ]:
df.select('id', when(col('p_id').isNull(), 'Root')
          .when(col('p_id').isNotNull() & col('id').isin(distinct_parent_ids), 'Inner')
          .otherwise('Leaf').alias('type')).show()
+---+-----+
| id| type|
+---+-----+
|  1| Root|
|  2|Inner|
|  3| Leaf|
|  4| Leaf|
|  5| Leaf|
+---+-----+
Pandas
pdf = df.toPandas()
pdf
   id  p_id
0   1   NaN
1   2   1.0
2   3   1.0
3   4   2.0
4   5   2.0
import numpy as np

pdf['type'] = np.where(pdf['p_id'].isna(), 'Root',
                       np.where(pdf['id'].isin(pdf['p_id'].unique()) & pdf['p_id'].notna(), 'Inner', 'Leaf'))
pdf[['id', 'type']]
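The nested np.where works; a hedged alternative sketch with np.select keeps the three cases flat (same pdf as above) and should produce the same id/type table shown below.
# conditions are checked in order: Root first, then Inner, anything else is a Leaf
conditions = [pdf['p_id'].isna(),
              pdf['id'].isin(pdf['p_id'].dropna().unique())]
pdf['type'] = np.select(conditions, ['Root', 'Inner'], default='Leaf')
pdf[['id', 'type']]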
   id   type
0   1   Root
1   2  Inner
2   3   Leaf
3   4   Leaf
4   5   Leaf
Customers who bought all products
https://leetcode.com/problems/customers-who-bought-all-products/description/
customer_schema = StructType([StructField('customer_id', IntegerType(), False),
                              StructField('product_key', IntegerType(), False)])

product_schema = StructType([StructField('product_key', IntegerType(), False)])

customer_df = spark.createDataFrame([[1, 5],
                                     [2, 6],
                                     [3, 5],
                                     [3, 6],
                                     [1, 6]], schema=customer_schema)

product_df = spark.createDataFrame([[5],
                                    [6]], schema=product_schema)

customer_df.show()
+-----------+-----------+
|customer_id|product_key|
+-----------+-----------+
|          1|          5|
|          2|          6|
|          3|          5|
|          3|          6|
|          1|          6|
+-----------+-----------+
product_df.show()
+-----------+
|product_key|
+-----------+
|          5|
|          6|
+-----------+
grouped_df = customer_df.dropDuplicates().groupBy(
    'customer_id').agg(count(col('customer_id')).alias('count'))
grouped_df.show()
+-----------+-----+
|customer_id|count|
+-----------+-----+
|          1|    2|
|          3|    2|
|          2|    1|
+-----------+-----+
grouped_df.filter(col('count') == product_df.count()).select('customer_id').show()
+-----------+
|customer_id|
+-----------+
|          1|
|          3|
+-----------+
Triangle Judgement
https://leetcode.com/problems/triangle-judgement/description/
schema = StructType([StructField('x', IntegerType(), False),
                     StructField('y', IntegerType(), False),
                     StructField('z', IntegerType(), False)])

df = spark.createDataFrame([[13, 15, 30],
                            [10, 20, 15]], schema=schema)
df.show()
+---+---+---+
|  x|  y|  z|
+---+---+---+
| 13| 15| 30|
| 10| 20| 15|
+---+---+---+
df.withColumn('triangle', when((col('x') + col('y') > col('z')) &
                               (col('y') + col('z') > col('x')) &
                               (col('x') + col('z') > col('y')), 'Yes')
              .otherwise('No')).show()
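The three-way condition above spells out the full triangle inequality. A hedged, slightly more compact sketch checks that the two smaller sides together exceed the largest one (same df; it should produce the same triangle column shown below):
from pyspark.sql.functions import greatest

# sum of all sides minus the largest side must exceed the largest side
df.withColumn('triangle',
              when(col('x') + col('y') + col('z') - greatest('x', 'y', 'z') > greatest('x', 'y', 'z'), 'Yes')
              .otherwise('No')).show()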
+---+---+---+--------+
|  x|  y|  z|triangle|
+---+---+---+--------+
| 13| 15| 30|      No|
| 10| 20| 15|     Yes|
+---+---+---+--------+
Invalid Tweets
https://leetcode.com/problems/invalid-tweets/?envType=study-plan-v2&envId=30-days-of-pandas&lang=pythondata
schema = StructType([StructField('tweet_id', IntegerType(), False),
                     StructField('content', StringType(), True)])

df = spark.createDataFrame([[1, 'Vote for Biden'],
                            [2, 'Let us make America great again!']], schema=schema)
df.show(truncate=False)
+--------+--------------------------------+
|tweet_id|content                         |
+--------+--------------------------------+
|1       |Vote for Biden                  |
|2       |Let us make America great again!|
+--------+--------------------------------+
df.filter(length(col('content')) > 15).select('tweet_id').show()
+--------+
|tweet_id|
+--------+
|       2|
+--------+
Calculate Special Bonus
https://leetcode.com/problems/calculate-special-bonus/description/?envType=study-plan-v2&envId=30-days-of-pandas&lang=pythondata
schema = StructType([StructField('employee_id', IntegerType(), False),
                     StructField('name', StringType(), True),
                     StructField('salary', IntegerType(), True)])

df = spark.createDataFrame([[2, 'Meir', 3000],
                            [3, 'Michael', 3800],
                            [7, 'Addilyn', 7400],
                            [8, 'Juan', 6100],
                            [9, 'Kannon', 7700]], schema=schema)
df.show(truncate=False)
+-----------+-------+------+
|employee_id|name   |salary|
+-----------+-------+------+
|2          |Meir   |3000  |
|3          |Michael|3800  |
|7          |Addilyn|7400  |
|8          |Juan   |6100  |
|9          |Kannon |7700  |
+-----------+-------+------+
df.withColumn('bonus',
              when((col('employee_id') % 2 == 1) & (~col('name').startswith('M')), col('salary'))
              .otherwise(0).alias('bonus')) \
  .select('employee_id', 'bonus').show()
+-----------+-----+
|employee_id|bonus|
+-----------+-----+
|          2|    0|
|          3|    0|
|          7| 7400|
|          8|    0|
|          9| 7700|
+-----------+-----+
Market Analysis I
https://leetcode.com/problems/market-analysis-i/description/
users_schema = StructType([StructField('user_id', IntegerType(), False),
                           StructField('join_date', StringType(), False),
                           StructField('favorite_brand', StringType(), False)])

users_df = spark.createDataFrame([[1, '2018-01-01', 'Lenovo'],
                                  [2, '2018-02-09', 'Samsung'],
                                  [3, '2018-01-09', 'LG'],
                                  [4, '2018-05-21', 'HP']], schema=users_schema)

orders_schema = StructType([StructField('order_id', IntegerType(), False),
                            StructField('order_date', StringType(), False),
                            StructField('item_id', IntegerType(), False),
                            StructField('buyer_id', IntegerType(), False),
                            StructField('seller_id', IntegerType(), False)])

orders_df = spark.createDataFrame([[1, '2019-08-01', 4, 1, 2],
                                   [2, '2018-08-02', 2, 1, 3],
                                   [3, '2019-08-03', 3, 2, 3],
                                   [4, '2018-08-04', 1, 4, 2],
                                   [5, '2018-08-04', 1, 3, 4],
                                   [6, '2019-08-05', 2, 2, 4]], schema=orders_schema)
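The dates are loaded as strings here and cast to DateType in the next cell. A hedged alternative sketch converts them with to_date instead of a plain cast; users_dated and orders_dated are just illustrative names and are not used further in the notebook.
from pyspark.sql.functions import to_date

# hypothetical frames; the notebook's own cast-based version follows below
users_dated = users_df.withColumn('join_date', to_date('join_date', 'yyyy-MM-dd'))
orders_dated = orders_df.withColumn('order_date', to_date('order_date', 'yyyy-MM-dd'))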
users_df = users_df.withColumn('join_date', col('join_date').cast('date'))
orders_df = orders_df.withColumn('order_date', col('order_date').cast('date'))

users_df.printSchema()
root
 |-- user_id: integer (nullable = false)
 |-- join_date: date (nullable = true)
 |-- favorite_brand: string (nullable = false)
orders_df.printSchema()
root
 |-- order_id: integer (nullable = false)
 |-- order_date: date (nullable = true)
 |-- item_id: integer (nullable = false)
 |-- buyer_id: integer (nullable = false)
 |-- seller_id: integer (nullable = false)
orders_temp = orders_df.filter(year(col('order_date')) == 2019).groupBy(
    col('buyer_id')).agg(count('order_id').alias('orders_in_2019'))
orders_temp.show()
+--------+--------------+
|buyer_id|orders_in_2019|
+--------+--------------+
|       1|             1|
|       2|             2|
+--------+--------------+
users_df.join(orders_temp, users_df.user_id == orders_temp.buyer_id, how='left') \
        .select(col('user_id').alias('buyer_id'), 'join_date', 'orders_in_2019') \
        .fillna(0, subset=['orders_in_2019']).show()
+--------+----------+--------------+
|buyer_id| join_date|orders_in_2019|
+--------+----------+--------------+
|       1|2018-01-01|             1|
|       2|2018-02-09|             2|
|       3|2018-01-09|             0|
|       4|2018-05-21|             0|
+--------+----------+--------------+
Department Highest Salary
https://leetcode.com/problems/department-highest-salary/
employee_schema = StructType([StructField('id', IntegerType(), False),
                              StructField('name', StringType(), False),
                              StructField('salary', IntegerType(), False),
                              StructField('departmentId', IntegerType(), False)])

employee_df = spark.createDataFrame([[1, 'Joe', 70000, 1],
                                     [2, 'Jim', 90000, 1],
                                     [3, 'Henry', 80000, 2],
                                     [4, 'Sam', 60000, 2],
                                     [5, 'Max', 90000, 1]], schema=employee_schema)

department_schema = StructType([StructField('id', IntegerType(), False),
                                StructField('name', StringType(), False)])

department_df = spark.createDataFrame([[1, 'IT'],
                                       [2, 'Sales']], schema=department_schema)

employee_df.show()
+---+-----+------+------------+
| id| name|salary|departmentId|
+---+-----+------+------------+
|  1|  Joe| 70000|           1|
|  2|  Jim| 90000|           1|
|  3|Henry| 80000|           2|
|  4|  Sam| 60000|           2|
|  5|  Max| 90000|           1|
+---+-----+------+------------+
department_df.show()
+---+-----+
| id| name|
+---+-----+
|  1|   IT|
|  2|Sales|
+---+-----+
max_salary = employee_df.groupby(col('departmentId')).max(
    'salary').withColumnRenamed('max(salary)', 'max_salary')
max_salary.show()
+------------+----------+
|departmentId|max_salary|
+------------+----------+
|           1|     90000|
|           2|     80000|
+------------+----------+
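Joining back against the per-department maximum (continued below) is one way. A hedged alternative sketch ranks salaries inside each department with a window function and keeps the top-ranked rows, using the same employee_df and department_df as above:
from pyspark.sql.functions import dense_rank

w = Window.partitionBy('departmentId').orderBy(col('salary').desc())

# keep every employee whose salary ties for the department maximum
ranked = employee_df.withColumn('rnk', dense_rank().over(w)).filter(col('rnk') == 1)

(ranked.join(department_df, ranked['departmentId'] == department_df['id'], how='inner')
       .select(department_df['name'].alias('Department'),
               ranked['name'].alias('Employee'),
               ranked['salary'].alias('Salary'))
       .show())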
max_salary = max_salary.withColumnRenamed('departmentId', 'depId')
employee_df = employee_df.withColumnRenamed('name', 'empName')

employee_df.join(max_salary, (employee_df['departmentId'] == max_salary['depId']) &
                 (employee_df['salary'] == max_salary['max_salary']), how='inner') \
           .join(department_df, employee_df['departmentId'] == department_df['id'], how='inner') \
           .select(col('name').alias('Department'), col('empName').alias('Employee'),
                   col('salary').alias('Salary')).show(truncate=False)
+----------+--------+------+
|Department|Employee|Salary|
+----------+--------+------+
|IT        |Max     |90000 |
|IT        |Jim     |90000 |
|Sales     |Henry   |80000 |
+----------+--------+------+
The Number of Rich Customers
https://leetcode.com/problems/the-number-of-rich-customers/description/?envType=study-plan-v2&envId=30-days-of-pandas&lang=pythondata
import pandas as pd
import io

df = pd.read_csv(io.StringIO('''
bill_id,customer_id,amount
6,1,549
8,1,834
4,2,394
11,3,657
13,3,257
'''))
df
   bill_id  customer_id  amount
0        6            1     549
1        8            1     834
2        4            2     394
3       11            3     657
4       13            3     257
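The pandas/io round-trip above is a convenient way to paste CSV text. For a table this small, a hedged alternative is to build the DataFrame directly; bills_df is just an illustrative name and the notebook's own conversion follows in the next cell.
# hypothetical variable name; same rows as the CSV above
bills_df = spark.createDataFrame([(6, 1, 549), (8, 1, 834), (4, 2, 394), (11, 3, 657), (13, 3, 257)],
                                 ['bill_id', 'customer_id', 'amount'])
bills_df.show()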
df = spark.createDataFrame(df)
df.show()
+-------+-----------+------+
|bill_id|customer_id|amount|
+-------+-----------+------+
|      6|          1|   549|
|      8|          1|   834|
|      4|          2|   394|
|     11|          3|   657|
|     13|          3|   257|
+-------+-----------+------+
df.filter(col('amount') > 500).select(countDistinct(col('customer_id')).alias('rich_count')).show()
+----------+
|rich_count|
+----------+
|         2|
+----------+
Project Employees I
https://leetcode.com/problems/project-employees-i/description/
project_schema = StructType([StructField('project_id', IntegerType(), False),
                             StructField('employee_id', IntegerType(), True)])

project_df = spark.createDataFrame([[1, 1],
                                    [1, 2],
                                    [1, 3],
                                    [2, 1],
                                    [2, 4]], schema=project_schema)

employee_schema = StructType([StructField('employee_id', IntegerType(), False),
                              StructField('name', StringType(), False),
                              StructField('experience_years', IntegerType(), True)])

employee_df = spark.createDataFrame([[1, 'Khaled', 3],
                                     [2, 'Ali', 2],
                                     [3, 'John', 1],
                                     [4, 'Doe', 2]], schema=employee_schema)

project_df.show()
+----------+-----------+
|project_id|employee_id|
+----------+-----------+
|         1|          1|
|         1|          2|
|         1|          3|
|         2|          1|
|         2|          4|
+----------+-----------+
employee_df.show()
+-----------+------+----------------+
|employee_id|  name|experience_years|
+-----------+------+----------------+
|          1|Khaled|               3|
|          2|   Ali|               2|
|          3|  John|               1|
|          4|   Doe|               2|
+-----------+------+----------------+
project_df.join(employee_df, on='employee_id', how='inner') \
          .groupBy('project_id').avg('experience_years') \
          .select(col('project_id'), round(col('avg(experience_years)'), 2).alias('average_years')).show()
+----------+-------------+
|project_id|average_years|
+----------+-------------+
|         1|          2.0|
|         2|          2.5|
+----------+-------------+
Customer Who Visited but Did Not Make Any Transactions
https://leetcode.com/problems/customer-who-visited-but-did-not-make-any-transactions/description/?envType=study-plan-v2&envId=top-sql-50
visits_schema = StructType([StructField('visit_id', IntegerType(), False),
                            StructField('customer_id', IntegerType(), False)])

visits_df = spark.createDataFrame([[1, 23],
                                   [2, 9],
                                   [4, 30],
                                   [5, 54],
                                   [6, 96],
                                   [7, 54],
                                   [8, 54]], schema=visits_schema)

transactions_schema = StructType([StructField('transaction_id', IntegerType(), False),
                                  StructField('visit_id', IntegerType(), False),
                                  StructField('amount', IntegerType(), False)])

transactions_df = spark.createDataFrame([[2, 5, 310],
                                         [3, 5, 300],
                                         [9, 5, 200],
                                         [12, 1, 910],
                                         [13, 2, 970]], schema=transactions_schema)

visits_df.show()
+--------+-----------+
|visit_id|customer_id|
+--------+-----------+
|       1|         23|
|       2|          9|
|       4|         30|
|       5|         54|
|       6|         96|
|       7|         54|
|       8|         54|
+--------+-----------+
transactions_df.show()
+--------------+--------+------+
|transaction_id|visit_id|amount|
+--------------+--------+------+
|             2|       5|   310|
|             3|       5|   300|
|             9|       5|   200|
|            12|       1|   910|
|            13|       2|   970|
+--------------+--------+------+
If transactions_df is small, we can collect its distinct visit ids and filter with isin -
unique_visit_ids_who_transacted = [i[0] for i in transactions_df.select('visit_id').distinct().collect()]

visits_df.filter(~visits_df['visit_id'].isin(unique_visit_ids_who_transacted)) \
         .groupBy('customer_id').count() \
         .withColumnRenamed('count', 'count_no_trans').show()
+-----------+--------------+
|customer_id|count_no_trans|
+-----------+--------------+
|         30|             1|
|         96|             1|
|         54|             2|
+-----------+--------------+
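Another hedged option besides the isin filter above and the left join that follows, closest in spirit to SQL's NOT IN: a left_anti join keeps only the visits with no matching transaction, needing neither a collect() nor a null filter (same visits_df and transactions_df):
visits_df.join(transactions_df, on='visit_id', how='left_anti') \
         .groupBy('customer_id') \
         .agg(count('visit_id').alias('count_no_trans')) \
         .show()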
If transactions_df is too large to collect(), we can use a left join and pick the records with a null transaction_id -
visits_df.join(transactions_df, on='visit_id', how='left') \
         .filter(col('transaction_id').isNull()) \
         .groupBy('customer_id').agg(count('visit_id')) \
         .withColumnRenamed('count(visit_id)', 'count_no_trans') \
         .show()
+-----------+--------------+
|customer_id|count_no_trans|
+-----------+--------------+
|         54|             2|
|         96|             1|
|         30|             1|
+-----------+--------------+
Product Price at a Given Date
https://leetcode.com/problems/product-price-at-a-given-date/description/
product_schema = StructType([StructField('product_id', IntegerType(), False),
                             StructField('new_price', IntegerType(), True),
                             StructField('change_date', StringType(), True)])

product_df = spark.createDataFrame([[1, 20, '2019-08-14'],
                                    [2, 50, '2019-08-14'],
                                    [1, 30, '2019-08-15'],
                                    [1, 35, '2019-08-16'],
                                    [2, 65, '2019-08-17'],
                                    [3, 20, '2019-08-18']],
                                   schema=product_schema).select('product_id', 'new_price', col('change_date').cast('date'))
product_df.show()
+----------+---------+-----------+
|product_id|new_price|change_date|
+----------+---------+-----------+
|         1|       20| 2019-08-14|
|         2|       50| 2019-08-14|
|         1|       30| 2019-08-15|
|         1|       35| 2019-08-16|
|         2|       65| 2019-08-17|
|         3|       20| 2019-08-18|
+----------+---------+-----------+
from pyspark.sql.functions import min as min_
# note - it is important to alias the PySpark min function, since otherwise the Python builtin min() would be used and break the code
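A hedged side note on the shadowing issue above: another common convention is to import the whole functions module under an alias, so no builtin gets shadowed. The aggregation below is purely illustrative, and 'first_change_date' is just an example alias.
from pyspark.sql import functions as F

# F.min is unambiguous and leaves Python's builtin min() untouched
product_df.groupBy('product_id').agg(F.min('change_date').alias('first_change_date')).show()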
product_df.filter(col('change_date') <= '2019-08-16') \
          .select('product_id', first('new_price').over(Window.partitionBy('product_id').orderBy(col('change_date').desc())).alias('price')) \
          .union(
              product_df.groupBy('product_id').agg(min_('change_date').alias('change_date'))
              .filter(col('change_date') > '2019-08-16').withColumn('price', lit(10)).select('product_id', 'price')
          ).dropDuplicates().show()
+----------+-----+
|product_id|price|
+----------+-----+
|         2|   50|
|         1|   35|
|         3|   10|
+----------+-----+
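The window + union approach above works. As a hedged alternative sketch, take each product's latest change on or before 2019-08-16, join its price back, and fill the products with no change by that date with the default price of 10 (same product_df as above):
from pyspark.sql.functions import max as max_

# latest change date per product on or before the cutoff
latest = product_df.filter(col('change_date') <= '2019-08-16') \
                   .groupBy('product_id') \
                   .agg(max_('change_date').alias('change_date'))

product_df.join(latest, on=['product_id', 'change_date'], how='inner') \
          .select('product_id', col('new_price').alias('price')) \
          .join(product_df.select('product_id').distinct(), on='product_id', how='right') \
          .fillna(10, subset=['price']) \
          .show()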