# Grouped ranking (row_number) with PySpark, pandas, and SQL

# coding=utf-8

import pandas as pd
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as fn


# Sample data: six rows across two projects. grade and score are string
# columns, used to demonstrate multi-column, mixed-direction ranking.
records = [
    ('A', 'D', 'K'),
    ('A', 'D', 'J'),
    ('A', 'C', 'I'),
    ('B', 'C', 'H'),
    ('B', 'E', 'G'),
    ('B', 'E', 'F'),
]
pdf = pd.DataFrame(records, columns=['project', 'grade', 'score'])
print(pdf)

## pyspark:
# Same ranking in Spark: ROW_NUMBER() over a window partitioned by project,
# ordered by grade descending then score ascending.
spark = (SparkSession.builder
         .master('local')
         .appName('rank_app')
         .getOrCreate())
df = spark.createDataFrame(pdf)
rank_window = Window.partitionBy('project').orderBy(fn.desc('grade'), fn.asc('score'))
df = df.withColumn('row_num', fn.row_number().over(rank_window))
df.orderBy(fn.asc('project'), fn.desc('score')).show()
spark.stop()


## sql:
# Equivalent SQL for the same result: a ROW_NUMBER() window function
# partitioned by project, ordered by grade descending, score ascending.
'''
SELECT project, grade, score
    , ROW_NUMBER() OVER (PARTITION BY project ORDER BY grade DESC, score ASC) AS row_num
FROM table
ORDER BY project, score DESC
'''

## pandas 1:
# for col in list(pdf.columns):
#     hash_table = {v:i+1 for i,v in enumerate(sorted(pdf[col].unique()))}
#     pdf[f'nk_{col}'] = pdf[col].apply(lambda x: hash_table.get(x))
#
# # desc:
# pdf['neg_nk_grade'] = pdf['nk_grade'] * (-1)
# # rank:
# pdf['row_num'] = pdf[['project','neg_nk_grade','nk_score']].groupby(['project']).rank(ascending=True,method='first',na_option='bottom')
# pdf = pdf[['project','grade','score','row_num']]
# print(pdf)

## error ~~~
## Only works for ranking a single column after grouping; it cannot rank by
## multiple columns, let alone mix ascending/descending directions per column.
## Non-numeric columns cannot be ranked this way.
## With numeric_only=False, non-numeric columns cannot use method='first'.

## pandas 2:
# Sort the whole frame by (grade desc, score asc), then number the rows
# within each project group. cumcount() preserves the original index, so
# assigning the result back aligns each rank with its source row.
by_rank_order = pdf.sort_values(['grade', 'score'], ascending=[False, True])
pdf['row_num'] = by_rank_order.groupby(['project']).cumcount() + 1
print(pdf)