Python数据分析入门：Pandas与NumPy实战

适用人群：数据分析师、Python初学者
目标：掌握Python数据分析基础工具
前置知识：Python基础语法

为什么选择Python进行数据分析？

Python在数据分析领域占据主导地位，主要原因包括：

丰富的库生态：NumPy、Pandas、Matplotlib、Scikit-learn等
易于学习：语法简洁，上手快
社区活跃：大量文档、教程和解决方案
可扩展性强：可与其他语言（C/C++）集成
通用性强：从数据清洗到机器学习一站式解决

NumPy基础

什么是NumPy？

NumPy是Python科学计算的基础库，提供了高性能的多维数组对象和数学函数。

安装NumPy

pip install numpy

创建数组

import numpy as np

# Build a 1-D array from a plain Python list
arr1 = np.array([1, 2, 3, 4, 5])
print(arr1)  # [1 2 3 4 5]

# Nested lists become a 2-D array (2 rows, 3 columns)
rows = [[1, 2, 3], [4, 5, 6]]
arr2 = np.array(rows)
print(arr2)
# [[1 2 3]
#  [4 5 6]]

# Array filled with zeros
zeros = np.zeros(shape=(3, 4))
print(zeros.shape)  # (3, 4)

# Array filled with ones
ones = np.ones(shape=(2, 3))

# 3x3 identity matrix
identity = np.eye(N=3)

# 3x3 array of random values
random_arr = np.random.rand(3, 3)

# Arithmetic sequence: start at 0, stop before 10, step 2
range_arr = np.arange(start=0, stop=10, step=2)  # [0 2 4 6 8]

# 5 evenly spaced points from 0 to 1, both endpoints included
linspace_arr = np.linspace(start=0, stop=1, num=5)  # [0.   0.25 0.5  0.75 1.  ]

数组运算

import numpy as np

a = np.array([1, 2, 3, 4])
b = np.array([5, 6, 7, 8])

# Element-wise arithmetic between two arrays of the same shape
print(a + b)   # [ 6  8 10 12]
print(a - b)   # [-4 -4 -4 -4]
print(a * b)   # [ 5 12 21 32]
print(a / b)   # [0.2        0.33333333 0.42857143 0.5       ]

# Arithmetic with a scalar applies to every element
print(a * 2)   # [2 4 6 8]
print(a + 10)  # [11 12 13 14]

# Element-wise math functions: square root, exponential, natural log, sine
for func in (np.sqrt, np.exp, np.log, np.sin):
    print(func(a))

# Summary statistics (method form of np.sum / np.mean / np.std / np.max / np.min)
print(a.sum())     # sum: 10
print(a.mean())    # mean: 2.5
print(a.std())     # standard deviation
print(a.max())     # maximum: 4
print(a.min())     # minimum: 1

数组索引和切片

import numpy as np

# 3x3 matrix holding 1..9, filled row by row
arr = np.arange(1, 10).reshape(3, 3)

# Single elements are addressed as [row, column]
top_left = arr[0, 0]
print(top_left)      # 1
row1_col2 = arr[1, 2]
print(row1_col2)     # 6

# Slices: ':' selects everything along that axis
first_row = arr[0, :]
print(first_row)     # [1 2 3] - first row
first_col = arr[:, 0]
print(first_col)     # [1 4 7] - first column
sub_block = arr[0:2, 1:3]
print(sub_block)     # [[2 3] [5 6]]

# Boolean indexing: keep the elements where the mask is True
greater_than_5 = arr > 5
print(arr[greater_than_5])  # [6 7 8 9]

# Fancy indexing: pairs up row and column lists -> elements (0, 1) and (2, 2)
row_ids = [0, 2]
col_ids = [1, 2]
print(arr[row_ids, col_ids])  # [2 9]

Pandas基础

什么是Pandas？

Pandas是基于NumPy的数据分析库，提供了DataFrame和Series两种核心数据结构，是数据分析的利器。

安装Pandas

pip install pandas

Series

Series是带标签的一维数组：每个元素都对应一个索引标签，既可以按标签访问，也可以按位置访问。

import pandas as pd

# Create a Series with explicit string labels
s = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
print(s)

# Create a Series from a dict: keys become the index labels
data = {'a': 1, 'b': 2, 'c': 3}
s = pd.Series(data)

# Access elements
print(s['a'])       # 1 - by label
# Use .iloc for positional access: a bare integer key such as s[0] on a
# label-indexed Series is deprecated and is not reliable across pandas versions.
print(s.iloc[0])    # 1 - by position
print(s['a':'c'])   # label slice - both end labels are included

# Arithmetic is applied element-wise
print(s * 2)
print(s + 10)

# Statistics
print(s.mean())      # mean
print(s.sum())       # sum
print(s.describe())  # descriptive statistics

DataFrame

DataFrame是二维表格数据结构，类似于Excel表格或SQL表。

import pandas as pd

# Build a DataFrame from a dict: each key is a column name, each list a column
names = ['Alice', 'Bob', 'Charlie', 'David']
ages = [25, 30, 35, 28]
cities = ['New York', 'London', 'Paris', 'Tokyo']
data = {'name': names, 'age': ages, 'city': cities}
df = pd.DataFrame(data)
print(df)

# Load from a CSV file (expects 'data.csv' in the working directory)
df = pd.read_csv('data.csv')

# Load from an Excel file (expects 'data.xlsx' in the working directory)
# NOTE(review): reading .xlsx presumably needs an Excel engine such as openpyxl
# to be installed - confirm for the target environment
df = pd.read_excel('data.xlsx')

# Inspect the data
print(df.head())      # first 5 rows
print(df.tail())      # last 5 rows
df.info()             # column/dtype summary; info() prints itself and returns None,
                      # so wrapping it in print() would emit a stray "None"
print(df.describe())  # summary statistics
print(df.shape)       # (rows, columns)
print(df.columns)     # column names

数据选择和过滤

import pandas as pd

records = {
    'name': ['Alice', 'Bob', 'Charlie', 'David'],
    'age': [25, 30, 35, 28],
    'city': ['New York', 'London', 'Paris', 'Tokyo'],
}
df = pd.DataFrame(records)

# Column selection
print(df['name'])           # one column
print(df[['name', 'age']])  # several columns

# Row selection
print(df.loc[0])            # by index label
print(df.iloc[0])           # by integer position
print(df.loc[0:2])          # label slice - the end label 2 is included

# Conditional filtering with boolean masks
older_than_25 = df['age'] > 25
in_london = df['city'] == 'London'
name_has_a = df['name'].str.contains('A')
print(df[older_than_25])
print(df[older_than_25 & in_london])
print(df[name_has_a])

# Membership test with isin
in_target_cities = df['city'].isin(['London', 'Paris'])
print(df[in_target_cities])

数据清洗

import pandas as pd
import numpy as np

# Sample data containing missing values
df = pd.DataFrame({
    'A': [1, 2, np.nan, 4],
    'B': [5, np.nan, np.nan, 8],
    'C': [9, 10, 11, 12]
})

# Detect missing values
print(df.isnull())        # boolean mask of missing cells
print(df.isnull().sum())  # missing count per column

# Drop missing values
print(df.dropna())           # drop rows that contain any missing value
print(df.dropna(axis=1))     # drop columns that contain any missing value
print(df.dropna(thresh=2))   # keep only rows with at least 2 non-missing values

# Fill missing values
print(df.fillna(0))          # fill with 0
print(df.fillna(df.mean()))  # fill with each column's mean
# fillna(method=...) is deprecated; ffill()/bfill() are the supported spelling
print(df.ffill())            # forward fill: propagate the previous valid value
print(df.bfill())            # backward fill: use the next valid value

# Remove duplicate rows
df = df.drop_duplicates()

# Replace values: in column 'A', 1 -> 100 and 2 -> 200
df = df.replace({'A': {1: 100, 2: 200}})

数据操作

import pandas as pd

df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie'],
    'age': [25, 30, 35],
    'city': ['New York', 'London', 'Paris']
})

# Add columns
df['country'] = ['USA', 'UK', 'France']
df['age_next_year'] = df['age'] + 1

# Drop a column. drop() returns a new frame, so reassign the result.
# The in-place spelling is df.drop('age_next_year', axis=1, inplace=True);
# use one form or the other - dropping the same column twice raises KeyError.
df = df.drop('age_next_year', axis=1)

# Rename columns; from here on the columns are '姓名' and '年龄',
# so later steps must use the new names
df = df.rename(columns={'name': '姓名', 'age': '年龄'})

# Sort
df = df.sort_values('年龄')                   # ascending
df = df.sort_values('年龄', ascending=False)  # descending

# Group
grouped = df.groupby('city')
print(grouped['年龄'].mean())  # mean age per city

# Apply a function to every value of a column
df['age_category'] = df['年龄'].apply(lambda x: 'Young' if x < 30 else 'Old')

数据聚合

import pandas as pd

df = pd.DataFrame({
    'category': ['A', 'B', 'A', 'B', 'A'],
    'value': [10, 20, 30, 40, 50]
})

# Group by one column and aggregate
print(df.groupby('category').sum())
print(df.groupby('category').mean())
print(df.groupby('category').agg(['sum', 'mean', 'count']))

# Group by several columns: size() counts the rows in each (category, value) pair.
# Printed explicitly - a bare expression shows nothing when run as a script.
print(df.groupby(['category', 'value']).size())

# Pivot table: total 'value' per category
pivot = df.pivot_table(
    values='value',
    index='category',
    aggfunc='sum'
)
print(pivot)

数据合并

import pandas as pd

df1 = pd.DataFrame({'key': ['A', 'B', 'C'], 'value1': [1, 2, 3]})
df2 = pd.DataFrame({'key': ['A', 'B', 'D'], 'value2': [4, 5, 6]})

# Merge on 'key' (like SQL JOIN), once per join type:
# inner join, outer join, left join, right join
for how in ('inner', 'outer', 'left', 'right'):
    print(pd.merge(df1, df2, on='key', how=how))

# Concatenate
df3 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
df4 = pd.DataFrame({'A': [5, 6], 'B': [7, 8]})

# axis=0 stacks the frames vertically, axis=1 places them side by side
for axis in (0, 1):
    print(pd.concat([df3, df4], axis=axis))

数据可视化

使用Matplotlib

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Prepare data: 100 evenly spaced x values from 0 to 10 and their sine
x = np.linspace(0, 10, 100)
y = np.sin(x)

# Line plot with axis labels, title, legend and grid.
# NOTE(review): the CJK label/title strings below may render as empty boxes
# unless a CJK-capable font is configured for matplotlib - confirm on the
# target machine.
plt.figure(figsize=(10, 6))
plt.plot(x, y, label='sin(x)')
plt.xlabel('X轴')
plt.ylabel('Y轴')
plt.title('正弦函数')
plt.legend()
plt.grid(True)
plt.show()

# Bar chart drawn through the DataFrame's own plot method
df = pd.DataFrame({
    'category': ['A', 'B', 'C', 'D'],
    'value': [10, 25, 15, 30]
})
df.plot(kind='bar', x='category', y='value')
plt.show()

# Scatter plot of value against category
plt.scatter(df['category'], df['value'])
plt.show()

# Histogram of 1000 random samples, split into 30 bins
data = np.random.randn(1000)
plt.hist(data, bins=30)
plt.show()

使用Pandas内置绘图

import pandas as pd
import matplotlib.pyplot as plt

df = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': [5, 4, 3, 2, 1],
    'C': [2, 3, 4, 5, 6]
})

# Draw the same frame as a line chart, bar chart, area chart and box plot,
# showing each figure in turn. 'line' is the kind df.plot() uses by default.
for kind in ('line', 'bar', 'area', 'box'):
    df.plot(kind=kind)
    plt.show()

实战案例：分析销售数据

import pandas as pd
import matplotlib.pyplot as plt

# Build sample data: 100 consecutive days, 4 products and 4 regions cycling in step
data = {
    'date': pd.date_range('2024-01-01', periods=100),
    'product': ['A', 'B', 'C', 'D'] * 25,
    'sales': [100 + i * 5 + (i % 10) * 10 for i in range(100)],
    'region': ['North', 'South', 'East', 'West'] * 25
}
df = pd.DataFrame(data)

# Data overview
print("数据概览:")
print(df.head())
print("\n数据信息:")
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line
df.info()
print("\n统计摘要:")
print(df.describe())

# Per-product statistics: total, mean and row count of sales
product_sales = df.groupby('product')['sales'].agg(['sum', 'mean', 'count'])
print("\n产品销售统计:")
print(product_sales)

# Per-region total sales
region_sales = df.groupby('region')['sales'].sum()
print("\n地区销售统计:")
print(region_sales)

# Visualization: one figure holding two side-by-side charts
plt.figure(figsize=(12, 5))

# Left panel: bar chart of total sales per product
plt.subplot(1, 2, 1)
product_sales['sum'].plot(kind='bar', color='skyblue')
plt.title('各产品总销售额')
plt.xlabel('产品')
plt.ylabel('销售额')

# Right panel: pie chart of each region's share, labelled with one-decimal percentages
plt.subplot(1, 2, 2)
region_sales.plot(kind='pie', autopct='%1.1f%%')
plt.title('各地区销售占比')

plt.tight_layout()
plt.show()

# Time-series analysis: derive the month number from 'date', then total sales per month
df['month'] = df['date'].dt.month
monthly_sales = df.groupby('month')['sales'].sum()
monthly_sales.plot(kind='line', marker='o')
plt.title('月度销售趋势')
plt.xlabel('月份')
plt.ylabel('销售额')
plt.grid(True)
plt.show()

最佳实践

1. 使用向量化操作

# Discouraged: element-by-element Python loop over the column
# NOTE(review): df is defined outside this snippet; df['value'][i] looks values
# up by index label, so this presumably relies on a default 0..n-1 index - confirm
result = []
for i in range(len(df)):
    result.append(df['value'][i] * 2)

# Preferred: one vectorized expression over the whole column
result = df['value'] * 2

2. 合理使用内存

# Inspect per-column memory use. deep=True also measures the Python objects
# held by object-dtype (e.g. string) columns; without it those columns are
# under-reported, which hides the saving from the conversion below.
print(df.memory_usage(deep=True))

# Store a repetitive column with the categorical dtype
df['category'] = df['category'].astype('category')

# Read only the columns that are needed
df = pd.read_csv('data.csv', usecols=['col1', 'col2'])

3. 链式操作

# Method chaining keeps a multi-step transformation readable: filter rows,
# group, aggregate, sort, then take the top 10 - with no intermediate variables.
# NOTE(review): df is defined outside this snippet and is assumed to have
# 'age', 'city' and 'salary' columns - confirm
result = (df
    .query('age > 25')
    .groupby('city')
    .agg({'salary': 'mean'})
    .sort_values('salary', ascending=False)
    .head(10)
)

总结

Python数据分析生态系统强大而完善，NumPy和Pandas是数据分析的基础工具。掌握这两个库，能够高效地处理和分析数据。

关键要点：

NumPy提供高性能数组运算
Pandas提供灵活的数据结构和数据分析工具
数据清洗是数据分析的重要步骤
可视化帮助更好地理解数据

码上笔记