0%

使用pandas 对比csv 文件

通过pandas 的compare function, 可以对比两个csv 文件
用途,例如升级或修改code之后,输出是csv文本文件,对于同样input的数据,预期应该一样
input -> program -> output
input -> program(optimize) -> output_new
expect output == output_new

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import pandas as pd

# Sample data
data_control = {
'ProductEntityId': [1, 2, 3, 4, 5],
'ColumnA': [10, 20, 30, 40, 50],
'ColumnB': ['A', 'B', 'C', 'D', 'E']
}

data_treatment = {
'ProductEntityId': [1, 2, 3, 4],
'ColumnA': [10, 25, 30, 40],
'ColumnB': ['A', 'X', 'C', 'D']
}

df_control = pd.DataFrame(data_control)
df_treatment = pd.DataFrame(data_treatment)

# Set ProductEntityId as the index
df_control.set_index('ProductEntityId', inplace=True)
df_treatment.set_index('ProductEntityId', inplace=True)

# Find the intersection of the indices
common_indices = df_control.index.intersection(df_treatment.index)

# Filter both dataframes to only include the common indices
df_control_common = df_control.loc[common_indices]
df_treatment_common = df_treatment.loc[common_indices]

# Ensure both dataframes have the same columns
df_treatment_common = df_treatment_common[df_control_common.columns]

# Compare the dataframes
comparison_df = df_control_common.compare(df_treatment_common, keep_shape=True, keep_equal=True)

# Highlight the differences
first_level_columns = comparison_df.columns.get_level_values(0).unique()
def highlight_differences(row):
styles = []
for col in comparison_df.columns.levels[0]:
self_val = row[(col, 'self')]
other_val = row[(col, 'other')]
if (~pd.isna(self_val) or ~pd.isna(other_val)) & (self_val != other_val):
styles.append('background-color: yellow')
styles.append('background-color: yellow')
else:
styles.append('')
styles.append('')
return styles
highlighted_df = comparison_df.style.apply(highlight_differences, axis=1)

# Display the highlighted dataframe
# highlighted_df
# write hightlighted dataframe to a file
highlighted_df.to_excel("./highlighted_output.xlsx", engine='openpyxl', index=True)