import hashlib
import json
import logging
import random
import time
import uuid
from abc import ABC, abstractmethod
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from datetime import datetime, timedelta
from functools import partial
from typing import Dict, List, Optional, Set, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
# Configure logging: set the root logger to INFO and create the named logger
# used for every message emitted by this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('AdAttributionSystem')
@dataclass
class TouchPoint:
    """A single advertising event recorded for one user.

    The first eight fields are required and positional; the remaining three
    are optional metadata that default to ``None``.
    """

    # Identity of the event and of the user it belongs to.
    id: str
    user_id: str

    # Where the ad was served.
    channel: str
    campaign_id: str
    creative_id: str

    # When it happened, what it cost and what kind of event it was.
    timestamp: datetime
    cost: float
    event_type: str  # impression, click, conversion

    # Optional details; conversion_value is only meaningful on conversions.
    conversion_value: Optional[float] = None
    device_type: Optional[str] = None
    location: Optional[str] = None
@dataclass
class UserJourney:
    """The touch points of one user together with the resulting conversion.

    ``path_length`` and ``time_to_convert`` are derived figures supplied by
    whoever builds the journey; they default to zero when not provided.
    """

    # Who converted and which ad events they were exposed to.
    user_id: str
    touch_points: List[TouchPoint]

    # The conversion itself.
    conversion_value: float
    conversion_time: datetime

    # Derived path statistics.
    path_length: int = 0
    time_to_convert: timedelta = timedelta()
class AttributionModel(ABC):
    """Interface shared by every attribution strategy."""

    @abstractmethod
    def attribute_conversions(self, journeys: List[UserJourney]) -> Dict[str, float]:
        """Split the conversion value of ``journeys`` across attribution keys.

        Args:
            journeys: The user journeys whose conversion value is distributed.

        Returns:
            A mapping from attribution key to the value credited to it. The
            key format is chosen by each concrete model.
        """
        return None
class LastClickModel(AttributionModel):
    """Last-click attribution: the final click takes all of the credit."""

    def attribute_conversions(self, journeys: List[UserJourney]) -> Dict[str, float]:
        """Credit each journey's full conversion value to its last click.

        Journeys with no touch points, or with no ``'click'`` event, add
        nothing to the result.

        Args:
            journeys: Journeys to attribute; ``touch_points`` is scanned from
                the end of the list backwards.

        Returns:
            Mapping of ``"channel:campaign_id"`` to the summed credited value.
        """
        credited = defaultdict(float)
        for journey in journeys:
            if not journey.touch_points:
                continue
            # Scan from the end and stop at the first click encountered.
            final_click = next(
                (tp for tp in reversed(journey.touch_points) if tp.event_type == 'click'),
                None,
            )
            if not final_click:
                continue
            credited[self._get_attribution_key(final_click)] += journey.conversion_value
        return dict(credited)

    def _get_attribution_key(self, touch_point: TouchPoint) -> str:
        """Return the key under which credit is accumulated (adjust as needed)."""
        return f"{touch_point.channel}:{touch_point.campaign_id}"
class LinearModel(AttributionModel):
    """Linear attribution: every click and impression gets an equal share."""

    def attribute_conversions(self, journeys: List[UserJourney]) -> Dict[str, float]:
        """Divide each journey's conversion value evenly over its ad events.

        Only ``'click'`` and ``'impression'`` events take part; a journey
        without any of them adds nothing to the result.

        Args:
            journeys: Journeys to attribute.

        Returns:
            Mapping of ``"channel:campaign_id:creative_id"`` to the summed
            credited value.
        """
        creditable_events = ('click', 'impression')
        totals = defaultdict(float)
        for journey in journeys:
            if not journey.touch_points:
                continue
            eligible = [
                tp for tp in journey.touch_points
                if tp.event_type in creditable_events
            ]
            if not eligible:
                continue
            # Same share for every eligible touch point in this journey.
            share = journey.conversion_value / len(eligible)
            for tp in eligible:
                totals[self._get_attribution_key(tp)] += share
        return dict(totals)

    def _get_attribution_key(self, touch_point: TouchPoint) -> str:
        """Return the creative-level key under which credit is accumulated."""
        return f"{touch_point.channel}:{touch_point.campaign_id}:{touch_point.creative_id}"
class TimeDecayModel(AttributionModel):
    """Time-decay attribution: touches closer to the conversion earn more.

    A touch point that happened exactly one ``half_life`` before the
    conversion receives half the weight of one that happened at conversion
    time; weights are then normalised per journey.
    """

    def __init__(self, half_life: timedelta = timedelta(hours=12)):
        """Create the model.

        Args:
            half_life: Time after which a touch point's weight halves.

        Raises:
            ValueError: If ``half_life`` is zero or negative (the decay
                exponent would be undefined or inverted).
        """
        if half_life <= timedelta(0):
            raise ValueError("half_life must be a positive timedelta")
        self.half_life = half_life
        logger.info(f"时间衰减模型初始化 - 半衰期: {half_life}")

    def attribute_conversions(self, journeys: List[UserJourney]) -> Dict[str, float]:
        """Distribute each journey's conversion value by time-decayed weight.

        Only ``'click'`` and ``'impression'`` events take part. Journeys with
        no such events, or whose weights sum to zero (every touch is so old
        that its weight underflows), contribute nothing.

        Args:
            journeys: Journeys to attribute.

        Returns:
            Mapping of ``"channel:campaign_id"`` to the summed credited value.
        """
        attribution = defaultdict(float)
        for journey in journeys:
            if not journey.touch_points:
                continue
            touch_points = [
                p for p in journey.touch_points
                if p.event_type in ('click', 'impression')
            ]
            if not touch_points:
                continue
            weights = self._calculate_weights(touch_points, journey.conversion_time)
            total_weight = sum(weights)
            if total_weight <= 0:
                continue
            # Normalise so the shares of one journey add up to its value.
            for point, weight in zip(touch_points, weights):
                attribution[self._get_attribution_key(point)] += (
                    journey.conversion_value * (weight / total_weight)
                )
        return dict(attribution)

    def _calculate_weights(self, touch_points: List[TouchPoint], conversion_time: datetime) -> List[float]:
        """Return ``0.5 ** (elapsed / half_life)`` for every touch point.

        Args:
            touch_points: Touch points to weigh.
            conversion_time: Reference time the decay is measured from.

        Returns:
            One weight per touch point, in the same order, each in ``[0, 1]``.
        """
        no_delay = timedelta(0)
        weights = []
        for point in touch_points:
            # A touch stamped after the conversion is treated as simultaneous
            # with it; otherwise the negative exponent would give it a weight
            # above 1 (and could overflow for large gaps).
            elapsed = max(conversion_time - point.timestamp, no_delay)
            # timedelta / timedelta is already a plain float: the number of
            # half-lives that have elapsed.
            weights.append(0.5 ** (elapsed / self.half_life))
        return weights

    def _get_attribution_key(self, touch_point: TouchPoint) -> str:
        """Return the key under which credit is accumulated."""
        return f"{touch_point.channel}:{touch_point.campaign_id}"
class DataDrivenModel(AttributionModel):
    """Data-driven attribution based on per-channel removal effects.

    Path states have the form ``"channel:event_type"``. First-order
    transition counts between consecutive states (plus a final transition to
    ``'conversion'``) are collected from the journeys, and the conversion
    chance of a shortened path is estimated from the outgoing transitions of
    its last state. This is a simplification of a full Markov-chain
    evaluation.
    """

    def __init__(self, n_simulations: int = 10000):
        """Create the model.

        Args:
            n_simulations: Kept for interface compatibility. The removal
                effect uses the conversion probability directly, so no
                sampling takes place.
        """
        self.n_simulations = n_simulations
        self.transition_counts = defaultdict(int)
        self.removal_effects = {}
        logger.info("数据驱动归因模型初始化")

    def attribute_conversions(self, journeys: List[UserJourney]) -> Dict[str, float]:
        """Credit channels in proportion to their removal effect.

        Args:
            journeys: Journeys to attribute.

        Returns:
            Mapping of channel name to credited value. Empty when there are
            no journeys, the baseline conversion rate is zero, or no channel
            has a positive removal effect.
        """
        if not journeys:
            return {}

        # 1. Collect the state transition counts.
        self._build_transition_matrix(journeys)

        # 2. Conversion rate with every channel present.
        baseline_conversion = self._calculate_baseline_conversion(journeys)
        if baseline_conversion <= 0:
            return {}

        # 3. Relative drop in conversion rate when each channel is removed.
        #    Sorted so the result order does not depend on set iteration.
        channel_effects = {}
        for channel in sorted(self._get_all_channels(journeys)):
            removal_conversion = self._calculate_removal_conversion(journeys, channel)
            effect = (baseline_conversion - removal_conversion) / baseline_conversion
            channel_effects[channel] = max(0, effect)  # never negative
        self.removal_effects = dict(channel_effects)

        # 4. Normalise the effects and share out the total conversion value.
        total_effect = sum(channel_effects.values())
        if total_effect <= 0:
            return {}
        total_value = sum(j.conversion_value for j in journeys)
        return {
            channel: (effect / total_effect) * total_value
            for channel, effect in channel_effects.items()
        }

    def _build_transition_matrix(self, journeys: List[UserJourney]):
        """Rebuild ``self.transition_counts`` from ``journeys``.

        Every non-empty path contributes its consecutive state transitions
        and one transition from its last state to ``'conversion'``.
        Single-touch paths have no internal transitions but still record
        their conversion transition.
        """
        self.transition_counts.clear()
        for journey in journeys:
            path = self._get_path(journey)
            if not path:
                continue
            for from_state, to_state in zip(path, path[1:]):
                self.transition_counts[(from_state, to_state)] += 1
            self.transition_counts[(path[-1], 'conversion')] += 1

    def _get_path(self, journey: UserJourney) -> List[str]:
        """Return the journey's states, ordered by touch point timestamp."""
        touch_points = sorted(journey.touch_points, key=lambda x: x.timestamp)
        return [f"{point.channel}:{point.event_type}" for point in touch_points]

    @staticmethod
    def _state_channel(state: str) -> str:
        """Return the channel part of a ``"channel:event_type"`` state."""
        return state.rsplit(':', 1)[0]

    def _calculate_baseline_conversion(self, journeys: List[UserJourney]) -> float:
        """Return the share of non-empty paths with a positive conversion value."""
        total_paths = 0
        converting_paths = 0
        for journey in journeys:
            if not self._get_path(journey):
                continue
            total_paths += 1
            if journey.conversion_value > 0:
                converting_paths += 1
        return converting_paths / total_paths if total_paths > 0 else 0

    def _get_all_channels(self, journeys: List[UserJourney]) -> Set[str]:
        """Return every distinct channel appearing in ``journeys``."""
        return {
            point.channel
            for journey in journeys
            for point in journey.touch_points
        }

    def _calculate_removal_conversion(self, journeys: List[UserJourney], channel: str) -> float:
        """Return the expected conversion rate once ``channel`` is removed.

        Paths that never touch ``channel`` keep their observed outcome. Paths
        that do are shortened and contribute their estimated conversion
        probability. A path left empty by the removal stays in the
        denominator as a non-converting path.

        Args:
            journeys: Journeys to evaluate.
            channel: Channel whose states are removed; compared for equality
                with the channel part of each state.

        Returns:
            Expected converting paths divided by non-empty paths, or 0 when
            there are no non-empty paths.
        """
        total_paths = 0
        expected_conversions = 0.0
        for journey in journeys:
            path = self._get_path(journey)
            if not path:
                continue
            total_paths += 1
            remaining = [
                state for state in path
                if self._state_channel(state) != channel
            ]
            if len(remaining) == len(path):
                # Path is unaffected by the removal.
                if journey.conversion_value > 0:
                    expected_conversions += 1
            elif remaining:
                expected_conversions += self._conversion_probability(remaining)
            # else: the whole path disappeared, so it cannot convert.
        return expected_conversions / total_paths if total_paths > 0 else 0

    def _conversion_probability(self, path: List[str]) -> float:
        """Return the observed chance that ``path``'s last state converts.

        This is the count of ``(last_state, 'conversion')`` transitions
        divided by all transitions leaving ``last_state``; 0.0 for an empty
        path or a state with no recorded outgoing transitions.
        """
        if not path:
            return 0.0
        last_state = path[-1]
        outgoing = sum(
            count
            for (from_state, _to_state), count in self.transition_counts.items()
            if from_state == last_state
        )
        if outgoing == 0:
            return 0.0
        return self.transition_counts.get((last_state, 'conversion'), 0) / outgoing

    def _simulate_conversion(self, path: List[str]) -> bool:
        """Draw one random conversion outcome for ``path``.

        Simplified simulation: only the last state of the path is
        considered. Returns ``False`` without drawing when the probability
        is zero.
        """
        conversion_prob = self._conversion_probability(path)
        if conversion_prob <= 0:
            return False
        return random.random() < conversion_prob
class AdAttributionAnalyzer:
    """Builds user journeys from touch points and runs attribution models."""

    def __init__(self, models: Dict[str, AttributionModel]):
        """Create the analyzer.

        Args:
            models: Attribution models to run, keyed by a display name.
        """
        self.models = models
        self.user_journeys = []
        logger.info("广告归因分析系统初始化")

    def add_touch_points(self, touch_points: List[TouchPoint]):
        """Group touch points by user and append one journey per converter.

        Users without a ``'conversion'`` event are skipped. When a user has
        several conversions, the earliest one (by timestamp) defines the
        journey. Only non-conversion touch points stamped at or before that
        conversion are kept, because later events cannot have contributed
        to it.

        Args:
            touch_points: Raw events of any type, in any order.
        """
        # Group by user id.
        user_points = defaultdict(list)
        for point in touch_points:
            user_points[point.user_id].append(point)

        new_journeys = []
        for user_id, points in user_points.items():
            conversions = [p for p in points if p.event_type == 'conversion']
            if not conversions:
                continue
            # One conversion per journey is assumed; take the earliest so the
            # choice does not depend on input order.
            conversion = min(conversions, key=lambda p: p.timestamp)
            sorted_points = sorted(
                (
                    p for p in points
                    if p.event_type != 'conversion'
                    and p.timestamp <= conversion.timestamp
                ),
                key=lambda p: p.timestamp,
            )
            if sorted_points:
                time_to_convert = conversion.timestamp - sorted_points[0].timestamp
            else:
                time_to_convert = timedelta(0)
            new_journeys.append(
                UserJourney(
                    user_id=user_id,
                    touch_points=sorted_points,
                    conversion_value=conversion.conversion_value or 0,
                    conversion_time=conversion.timestamp,
                    path_length=len(sorted_points),
                    time_to_convert=time_to_convert,
                )
            )

        self.user_journeys.extend(new_journeys)
        logger.info(f"添加 {len(new_journeys)} 条用户旅程,总计 {len(self.user_journeys)} 条")

    def run_attribution(self) -> Dict[str, Dict[str, float]]:
        """Run every configured model over the stored journeys.

        Returns:
            Mapping of model name to that model's attribution result.
        """
        results = {}
        for model_name, model in self.models.items():
            logger.info(f"运行归因模型: {model_name}")
            start_time = time.perf_counter()
            results[model_name] = model.attribute_conversions(self.user_journeys)
            elapsed = time.perf_counter() - start_time
            logger.info(f"完成 {model_name} 归因 - 耗时 {elapsed:.2f}秒")
        return results

    def analyze_journeys(self) -> Dict:
        """Summarise the stored journeys.

        Returns:
            An empty dict when no journeys are stored. Otherwise a dict with
            ``num_journeys``, ``total_conversion_value``,
            ``avg_path_length``, ``avg_time_to_convert`` (hours),
            ``channel_participation`` (journeys per channel) and
            ``journey_clusters`` (cluster label -> journey count; empty when
            there are five or fewer journeys).
        """
        if not self.user_journeys:
            return {}

        # Basic statistics.
        num_journeys = len(self.user_journeys)
        total_conversion_value = sum(j.conversion_value for j in self.user_journeys)
        avg_path_length = np.mean([j.path_length for j in self.user_journeys])
        avg_time_to_convert = np.mean(
            [j.time_to_convert.total_seconds() for j in self.user_journeys]
        ) / 3600  # seconds -> hours

        # Number of journeys in which each channel appears at least once.
        channel_participation = defaultdict(int)
        for journey in self.user_journeys:
            for channel in {p.channel for p in journey.touch_points}:
                channel_participation[channel] += 1

        # Cluster journeys only when there is enough data.
        journey_features = self._extract_journey_features()
        if len(journey_features) > 5:
            cluster_labels = self._cluster_journeys(journey_features)
            unique_clusters, cluster_counts = np.unique(cluster_labels, return_counts=True)
            # Plain ints keep the summary serialisable.
            cluster_info = {
                int(label): int(count)
                for label, count in zip(unique_clusters, cluster_counts)
            }
        else:
            cluster_info = {}

        return {
            'num_journeys': num_journeys,
            'total_conversion_value': total_conversion_value,
            'avg_path_length': avg_path_length,
            'avg_time_to_convert': avg_time_to_convert,
            'channel_participation': dict(channel_participation),
            'journey_clusters': cluster_info
        }

    def _extract_journey_features(self) -> List[Dict]:
        """Return one feature dict per stored journey.

        Each dict holds ``path_length``, ``time_to_convert`` (seconds),
        ``conversion_value`` and a 0/1 ``channel_<name>`` flag for every
        channel seen across all stored journeys.
        """
        # Sorted so the feature columns come out in a stable order.
        all_channels = sorted(self._get_all_channels())
        features = []
        for journey in self.user_journeys:
            feat = {
                'path_length': journey.path_length,
                'time_to_convert': journey.time_to_convert.total_seconds(),
                'conversion_value': journey.conversion_value
            }
            journey_channels = {p.channel for p in journey.touch_points}
            for channel in all_channels:
                feat[f'channel_{channel}'] = 1 if channel in journey_channels else 0
            features.append(feat)
        return features

    def _get_all_channels(self) -> Set[str]:
        """Return every distinct channel appearing in the stored journeys."""
        return {
            point.channel
            for journey in self.user_journeys
            for point in journey.touch_points
        }

    def _cluster_journeys(self, features: List[Dict]) -> np.ndarray:
        """Cluster journeys with KMeans, choosing k by silhouette score.

        Features are standardised first. Values of k from 2 up to
        ``min(10, len(features) - 1)`` are tried and the labelling with the
        highest silhouette score is returned. If no candidate yields at
        least two distinct clusters, every journey is put in cluster 0.

        Args:
            features: Feature dicts as produced by
                ``_extract_journey_features``.

        Returns:
            Array with one cluster label per journey.
        """
        df = pd.DataFrame(features)
        scaled_features = StandardScaler().fit_transform(df)

        max_clusters = min(10, len(features) - 1)
        best_score = -1
        best_k = 2
        best_labels = None
        for k in range(2, max_clusters + 1):
            # n_init is pinned so results do not depend on the library default.
            kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
            labels = kmeans.fit_predict(scaled_features)
            # silhouette_score needs at least two distinct labels; duplicate
            # rows can make KMeans return fewer clusters than requested.
            if len(set(labels)) < 2:
                continue
            score = silhouette_score(scaled_features, labels)
            if score > best_score:
                best_score = score
                best_k = k
                best_labels = labels

        if best_labels is None:
            best_labels = np.zeros(len(features), dtype=int)

        logger.info(f"完成用户旅程聚类 - 最佳k值: {best_k}, 轮廓系数: {best_score:.2f}")
        return best_labels

    def visualize_attribution(self, attribution_results: Dict[str, Dict[str, float]]):
        """Plot attributed value per channel for every model.

        Two figures are shown: absolute attributed value, and each channel's
        percentage share within its model. Attribution keys are reduced to
        the part before the first ``':'``, so finer-grained keys are summed
        per channel. Nothing is drawn when there is no data.

        Args:
            attribution_results: Output of ``run_attribution``.
        """
        if not attribution_results:
            return

        rows = [
            {
                'Model': model_name,
                'Channel': key.split(':')[0],  # channel name only
                'Attributed Value': value
            }
            for model_name, attributions in attribution_results.items()
            for key, value in attributions.items()
        ]
        if not rows:
            logger.warning("归因结果为空,跳过可视化")
            return
        df = pd.DataFrame(rows)

        # Grouped bar chart of absolute attributed value.
        # NOTE: `ci` is deprecated in newer seaborn releases in favour of
        # `errorbar`; it is kept so older releases continue to work.
        plt.figure(figsize=(12, 6))
        sns.barplot(
            data=df,
            x='Channel',
            y='Attributed Value',
            hue='Model',
            estimator=sum,
            ci=None
        )
        plt.title('各渠道归因价值对比')
        plt.ylabel('归因价值')
        plt.xlabel('广告渠道')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

        # Percentage share per model. `transform` keeps the rows aligned and
        # avoids the duplicated index level that groupby(...).apply(...) can
        # produce before reset_index().
        norm_df = df.groupby(['Model', 'Channel'], as_index=False)['Attributed Value'].sum()
        model_totals = norm_df.groupby('Model')['Attributed Value'].transform('sum')
        norm_df['Attributed Value'] = 100 * norm_df['Attributed Value'] / model_totals

        plt.figure(figsize=(12, 6))
        sns.barplot(
            data=norm_df,
            x='Channel',
            y='Attributed Value',
            hue='Model'
        )
        plt.title('各渠道归因价值占比(%)')
        plt.ylabel('归因价值占比')
        plt.xlabel('广告渠道')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()
# Example usage
if __name__ == "__main__":
    # 1. Instantiate the attribution models to compare.
    models = {
        'last_click': LastClickModel(),
        'linear': LinearModel(),
        'time_decay': TimeDecayModel(half_life=timedelta(hours=24)),
        'data_driven': DataDrivenModel(n_simulations=5000)
    }
    analyzer = AdAttributionAnalyzer(models)

    # 2. Generate mock data.
    def generate_mock_data(num_users: int = 1000) -> List[TouchPoint]:
        """Return random touch points for ``num_users`` synthetic users.

        Each user gets 1-8 events spread over roughly the last eight days,
        in chronological order. About 30% of users convert, and for them the
        final (most recent) event is the conversion.
        """
        channels = ['google', 'facebook', 'instagram', 'twitter', 'email']
        campaigns = [f'campaign_{i}' for i in range(1, 6)]
        devices = ['mobile', 'desktop', 'tablet']
        locations = ['US', 'UK', 'CA', 'AU', 'DE']

        generated_at = datetime.now()
        touch_points = []
        for _ in range(num_users):
            user_id = str(uuid.uuid4())
            # Decide whether this user converts (30% conversion rate).
            will_convert = random.random() < 0.3
            num_touch_points = random.randint(1, 8)

            # Draw how long ago each event happened and order them oldest
            # first, so the last event (the conversion, if any) is also the
            # latest in time.
            ages = sorted(
                (
                    timedelta(
                        days=random.uniform(0, 7),
                        hours=random.uniform(0, 24)
                    )
                    for _ in range(num_touch_points)
                ),
                reverse=True,
            )

            for i, age in enumerate(ages):
                channel = random.choice(channels)
                campaign = random.choice(campaigns)
                device = random.choice(devices)
                location = random.choice(locations)

                if i == num_touch_points - 1 and will_convert:
                    event_type = 'conversion'
                    conversion_value = random.uniform(10, 100)
                else:
                    event_type = random.choices(
                        ['impression', 'click'],
                        weights=[0.7, 0.3],
                        k=1
                    )[0]
                    conversion_value = None

                # Only clicks and impressions carry a cost.
                cost = random.uniform(0.1, 2.0) if event_type != 'conversion' else 0

                touch_points.append(
                    TouchPoint(
                        id=str(uuid.uuid4()),
                        user_id=user_id,
                        channel=channel,
                        campaign_id=campaign,
                        creative_id=f"creative_{random.randint(1, 10)}",
                        timestamp=generated_at - age,
                        cost=cost,
                        event_type=event_type,
                        conversion_value=conversion_value,
                        device_type=device,
                        location=location
                    )
                )
        return touch_points

    logger.info("生成模拟数据...")
    mock_data = generate_mock_data(2000)
    analyzer.add_touch_points(mock_data)

    # 3. Analyse the user journeys.
    journey_analysis = analyzer.analyze_journeys()
    print("\n用户旅程分析结果:")
    print(f"总旅程数: {journey_analysis['num_journeys']}")
    print(f"总转化价值: ${journey_analysis['total_conversion_value']:.2f}")
    print(f"平均路径长度: {journey_analysis['avg_path_length']:.1f}")
    print(f"平均转化时间: {journey_analysis['avg_time_to_convert']:.1f} 小时")
    print("\n渠道参与度:")
    for channel, count in journey_analysis['channel_participation'].items():
        print(f"{channel}: {count} 次")

    # 4. Run the attribution models.
    logger.info("运行归因分析...")
    attribution_results = analyzer.run_attribution()
    print("\n归因分析结果:")
    for model_name, attributions in attribution_results.items():
        print(f"\n{model_name} 模型:")
        for channel, value in attributions.items():
            print(f"{channel}: ${value:.2f}")

    # 5. Visualise the results.
    analyzer.visualize_attribution(attribution_results)
## 使用说明

### 功能特点

- 多模型归因分析:
  - 最后一次点击归因
  - 线性归因
  - 时间衰减归因
  - 数据驱动(马尔可夫链)归因
- 用户旅程分析:
  - 路径长度统计
  - 转化时间分析
  - 渠道参与度计算
  - 旅程模式聚类
- 高级分析功能:
  - 渠道移除效应计算
  - 转化概率模拟
  - 多维度特征提取
- 可视化支持:
  - 归因价值对比图
  - 渠道贡献百分比
  - 旅程聚类分析
### 核心组件

- TouchPoint:
  - 广告触点数据模型
  - 包含渠道、时间、成本等元数据
- UserJourney:
  - 用户转化路径模型
  - 包含触点序列和转化信息
- AttributionModel:
  - 归因模型抽象基类
  - 各具体模型实现
- AdAttributionAnalyzer:
  - 系统主控制器
  - 数据管理与分析入口
### 使用方法

- 初始化系统:

  ```python
  models = {
      'last_click': LastClickModel(),
      'linear': LinearModel(),
      'time_decay': TimeDecayModel(),
      'data_driven': DataDrivenModel()
  }
  analyzer = AdAttributionAnalyzer(models)
  ```

- 导入数据:

  ```python
  # 从数据库或文件加载触点数据
  touch_points = load_touch_points()
  analyzer.add_touch_points(touch_points)
  ```

- 运行分析:

  ```python
  # 分析用户旅程特征
  journey_analysis = analyzer.analyze_journeys()
  # 运行归因模型
  attribution_results = analyzer.run_attribution()
  ```

- 可视化结果:

  ```python
  analyzer.visualize_attribution(attribution_results)
  ```

- 获取洞察:

  ```python
  # 获取最佳归因模型的结果
  data_driven_results = attribution_results['data_driven']
  # 识别高价值渠道
  top_channels = sorted(data_driven_results.items(), key=lambda x: -x[1])[:3]
  ```
### 应用场景

- 营销效果评估:
  - 准确衡量各渠道贡献
  - 优化营销预算分配
- 用户行为分析:
  - 理解典型转化路径
  - 识别关键触点模式
- 策略优化:
  - 基于数据驱动的渠道优化
  - 调整触点顺序和时间
- ROI计算:
  - 结合成本数据计算真实ROI
  - 识别高效低耗渠道
### 技术亮点

- 马尔可夫链模型:
  - 计算渠道移除效应
  - 模拟路径转化概率
- 机器学习聚类:
  - 自动发现旅程模式
  - 轮廓系数确定最佳聚类数
- 高效数据处理:
  - 用户旅程构建算法
  - 并行计算支持
- 交互式可视化:
  - 多维度数据对比
  - 直观呈现归因差异
### 扩展建议

- 实时数据处理:
  - 流式数据接入
  - 实时归因计算
- 跨设备归因:
  - 用户身份识别
  - 设备图集成
- 预测模型:
  - 转化概率预测
  - 预算分配优化
- 异常检测:
  - 识别异常路径
  - 作弊行为检测
这个系统可以帮助营销团队准确理解各广告渠道的真实贡献,优化营销策略和预算分配,最大化投资回报率,特别适合多渠道数字营销场景。