Python 实现跨渠道广告归因与效果分析系统

安丨

于 2025-07-17 13:49:47 发布

阅读量400

点赞数 3

CC 4.0 BY-SA版权

文章标签： python 开发语言

本文链接：https://blog.csdn.net/y131673/article/details/149420144

import hashlib
import json
import logging
import random
import time
import uuid
from abc import ABC, abstractmethod
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from datetime import datetime, timedelta
from functools import partial
from typing import Dict, List, Optional, Set, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# Configure logging: INFO level on the root logger, plus a named module logger
# that every class in this file writes to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('AdAttributionSystem')

@dataclass
class TouchPoint:
    """A single ad touch point: one recorded ad event for one user."""
    id: str  # identifier of this event
    user_id: str  # user the event belongs to; journeys are grouped by this value
    channel: str  # ad channel name; used as part of every attribution key
    campaign_id: str
    creative_id: str
    timestamp: datetime  # when the event happened; used to order a journey
    cost: float
    event_type: str  # one of: impression, click, conversion
    conversion_value: Optional[float] = None  # value of the conversion, if any
    device_type: Optional[str] = None
    location: Optional[str] = None

@dataclass
class UserJourney:
    """One user's path to a conversion.

    As populated by ``AdAttributionAnalyzer.add_touch_points``, ``touch_points``
    holds the user's non-conversion events sorted by timestamp, ``path_length``
    is the number of those events, and ``time_to_convert`` is the conversion
    timestamp minus the timestamp of the first touch point.
    """
    user_id: str
    touch_points: List[TouchPoint]
    conversion_value: float  # value credited by the attribution models
    conversion_time: datetime  # timestamp of the conversion event
    path_length: int = 0
    time_to_convert: timedelta = timedelta(0)

class AttributionModel(ABC):
    """Abstract base class for attribution models."""
    
    @abstractmethod
    def attribute_conversions(self, journeys: List[UserJourney]) -> Dict[str, float]:
        """Distribute conversion credit across touch points.

        Args:
            journeys: The user journeys whose conversion value is to be credited.

        Returns:
            A mapping from an attribution key (chosen by the concrete model)
            to the conversion value credited to that key.
        """
        pass

class LastClickModel(AttributionModel):
    """Last-click attribution: the final click of a journey takes all credit."""

    def attribute_conversions(self, journeys: List[UserJourney]) -> Dict[str, float]:
        """Credit 100% of each journey's conversion value to its last click.

        Journeys without any click event contribute nothing.

        Args:
            journeys: The user journeys to attribute.

        Returns:
            Mapping of ``"channel:campaign_id"`` to total credited value.
        """
        credit = defaultdict(float)

        for journey in journeys:
            # Walk the path backwards and stop at the first click we meet,
            # which is the last click in the journey's stored order.
            winner = next(
                (tp for tp in reversed(journey.touch_points) if tp.event_type == 'click'),
                None,
            )
            if not winner:
                continue
            credit[self._get_attribution_key(winner)] += journey.conversion_value

        return dict(credit)

    def _get_attribution_key(self, touch_point: TouchPoint) -> str:
        """Build the attribution key (adjust here to change the granularity)."""
        return ":".join((f"{touch_point.channel}", f"{touch_point.campaign_id}"))

class LinearModel(AttributionModel):
    """Linear attribution: every click/impression gets an equal share."""

    def attribute_conversions(self, journeys: List[UserJourney]) -> Dict[str, float]:
        """Split each journey's conversion value evenly over its touch points.

        Only ``click`` and ``impression`` events take part; journeys with no
        such events contribute nothing.

        Args:
            journeys: The user journeys to attribute.

        Returns:
            Mapping of ``"channel:campaign_id:creative_id"`` to credited value.
        """
        credit = defaultdict(float)

        for journey in journeys:
            eligible = [
                tp for tp in journey.touch_points
                if tp.event_type in ('click', 'impression')
            ]
            if not eligible:
                continue

            share = journey.conversion_value / len(eligible)
            for tp in eligible:
                credit[self._get_attribution_key(tp)] += share

        return dict(credit)

    def _get_attribution_key(self, touch_point: TouchPoint) -> str:
        """Build the attribution key at creative granularity."""
        parts = (touch_point.channel, touch_point.campaign_id, touch_point.creative_id)
        return ":".join(f"{part}" for part in parts)

class TimeDecayModel(AttributionModel):
    """Time-decay attribution model.

    Each click/impression is weighted by ``0.5 ** (age / half_life)``, where
    ``age`` is the time between the touch point and the conversion. A touch
    point that is one half-life older than another therefore receives half of
    its weight. Weights are normalised per journey so that the journey's full
    conversion value is distributed.
    """

    def __init__(self, half_life: timedelta = timedelta(hours=12)):
        """Create the model.

        Args:
            half_life: Time after which a touch point's weight has halved.

        Raises:
            ValueError: If ``half_life`` is not strictly positive (it is used
                as a divisor when computing weights).
        """
        if half_life <= timedelta(0):
            raise ValueError("half_life must be a positive timedelta")
        self.half_life = half_life
        logger.info(f"时间衰减模型初始化 - 半衰期: {half_life}")

    def attribute_conversions(self, journeys: List[UserJourney]) -> Dict[str, float]:
        """Distribute each journey's conversion value by time-decay weight.

        Args:
            journeys: The user journeys to attribute.

        Returns:
            Mapping of ``"channel:campaign_id"`` to credited value.
        """
        attribution = defaultdict(float)

        for journey in journeys:
            # Only clicks and impressions are eligible for credit.
            touch_points = [
                p for p in journey.touch_points
                if p.event_type in ('click', 'impression')
            ]
            if not touch_points:
                continue

            weights = self._calculate_weights(touch_points, journey.conversion_time)
            total_weight = sum(weights)
            if total_weight <= 0:
                # Every weight underflowed to zero (touch points are extremely
                # old relative to the half-life); there is nothing to split.
                continue

            for point, weight in zip(touch_points, weights):
                attribution_key = self._get_attribution_key(point)
                attribution[attribution_key] += journey.conversion_value * (weight / total_weight)

        return dict(attribution)

    def _calculate_weights(self, touch_points: List[TouchPoint], conversion_time: datetime) -> List[float]:
        """Return one decay weight per touch point, in the same order.

        Args:
            touch_points: Touch points to weight.
            conversion_time: Timestamp of the journey's conversion.

        Returns:
            ``0.5 ** n`` for each touch point, where ``n`` is the number of
            half-lives between the touch point and the conversion.
        """
        weights = []
        for point in touch_points:
            elapsed = conversion_time - point.timestamp
            # timedelta / timedelta already yields a plain float number of
            # half-lives. Touch points stamped after the conversion are
            # clamped to zero elapsed time so they get full weight instead of
            # an unbounded weight greater than one.
            half_lives = max(elapsed / self.half_life, 0.0)
            weights.append(0.5 ** half_lives)
        return weights

    def _get_attribution_key(self, touch_point: TouchPoint) -> str:
        """Build the attribution key at campaign granularity."""
        return f"{touch_point.channel}:{touch_point.campaign_id}"

class DataDrivenModel(AttributionModel):
    """Data-driven attribution model based on channel removal effects.

    This is a simplified Markov-chain style approach. Transitions between
    ``"channel:event_type"`` states are counted, and each channel is credited
    in proportion to how much the conversion rate drops when that channel is
    removed from every path.
    """

    def __init__(self, n_simulations: int = 10000):
        """Create the model.

        Args:
            n_simulations: Stored on the instance. The simplified simulation
                in this class does not currently read it.
        """
        self.n_simulations = n_simulations
        self.transition_counts = defaultdict(int)
        self.removal_effects = {}
        logger.info("数据驱动归因模型初始化")

    def attribute_conversions(self, journeys: List[UserJourney]) -> Dict[str, float]:
        """Credit channels in proportion to their removal effect.

        Args:
            journeys: The user journeys to attribute.

        Returns:
            Mapping of channel name to credited value. Empty when there are
            no journeys, no conversions, or no channel has a positive effect.
        """
        if not journeys:
            return {}

        # 1. Build the state transition counts.
        self._build_transition_matrix(journeys)

        # 2. Compute the baseline conversion rate.
        baseline_conversion = self._calculate_baseline_conversion(journeys)
        if baseline_conversion <= 0:
            return {}

        # 3. Compute the removal effect of every channel. Channels are visited
        #    in sorted order so the sequence of random draws, and the order of
        #    the returned dict, do not depend on set iteration order.
        channel_effects = {}
        for channel in sorted(self._get_all_channels(journeys)):
            removal_conversion = self._calculate_removal_conversion(journeys, channel)
            effect = (baseline_conversion - removal_conversion) / baseline_conversion
            channel_effects[channel] = max(0, effect)  # keep effects non-negative

        # 4. Normalise the effects and split the total conversion value.
        total_effect = sum(channel_effects.values())
        if total_effect <= 0:
            return {}

        total_value = sum(j.conversion_value for j in journeys)
        return {
            channel: (effect / total_effect) * total_value
            for channel, effect in channel_effects.items()
        }

    def _build_transition_matrix(self, journeys: List[UserJourney]):
        """Recount state-to-state transitions from scratch.

        Paths with fewer than two states are skipped entirely, so they add
        neither internal transitions nor a transition to ``'conversion'``.
        """
        self.transition_counts.clear()

        for journey in journeys:
            path = self._get_path(journey)
            if len(path) < 2:
                continue

            # Record each consecutive pair of states.
            for from_state, to_state in zip(path, path[1:]):
                self.transition_counts[(from_state, to_state)] += 1

            # Record the final hop into the conversion state.
            self.transition_counts[(path[-1], 'conversion')] += 1

    def _get_path(self, journey: UserJourney) -> List[str]:
        """Return the journey as ``"channel:event_type"`` states in time order."""
        touch_points = sorted(journey.touch_points, key=lambda x: x.timestamp)
        return [f"{point.channel}:{point.event_type}" for point in touch_points]

    @staticmethod
    def _state_channel(state: str) -> str:
        """Extract the channel from a ``"channel:event_type"`` state string."""
        return state.rpartition(':')[0]

    def _calculate_baseline_conversion(self, journeys: List[UserJourney]) -> float:
        """Return the share of non-empty paths with a positive conversion value."""
        total_paths = 0
        converting_paths = 0

        for journey in journeys:
            if not self._get_path(journey):
                continue

            total_paths += 1
            if journey.conversion_value > 0:
                converting_paths += 1

        return converting_paths / total_paths if total_paths > 0 else 0

    def _get_all_channels(self, journeys: List[UserJourney]) -> Set[str]:
        """Return every distinct channel that appears in the journeys."""
        return {
            point.channel
            for journey in journeys
            for point in journey.touch_points
        }

    def _calculate_removal_conversion(self, journeys: List[UserJourney], channel: str) -> float:
        """Return the conversion rate after removing ``channel`` from all paths.

        Paths that never touch the channel keep their real outcome. Paths that
        consist only of the channel are dropped from both numerator and
        denominator. For the remaining paths the outcome of the shortened path
        is drawn by ``_simulate_conversion``.
        """
        total_paths = 0
        converting_paths = 0

        for journey in journeys:
            path = self._get_path(journey)
            if not path:
                continue

            # Compare the channel part exactly. A substring test would make
            # removing "google" also strip states of a channel "notgoogle".
            modified_path = [
                state for state in path if self._state_channel(state) != channel
            ]

            if len(modified_path) == len(path):
                # The path does not use the channel, so it is unaffected.
                total_paths += 1
                if journey.conversion_value > 0:
                    converting_paths += 1
                continue

            if not modified_path:
                # The whole path was removed.
                continue

            total_paths += 1
            if self._simulate_conversion(modified_path):
                converting_paths += 1

        return converting_paths / total_paths if total_paths > 0 else 0

    def _simulate_conversion(self, path: List[str]) -> bool:
        """Draw whether ``path`` converts, based on its last state.

        Simplified simulation: the conversion probability is the share of
        recorded transitions out of the last state that go to
        ``'conversion'``. A full Markov-chain computation is not performed.
        """
        if not path:
            return False

        last_state = path[-1]
        conversion_count = self.transition_counts.get((last_state, 'conversion'), 0)
        total_transitions = sum(
            count for (from_state, _), count in self.transition_counts.items()
            if from_state == last_state
        )

        if total_transitions == 0:
            return False

        conversion_prob = conversion_count / total_transitions
        return random.random() < conversion_prob

class AdAttributionAnalyzer:
    """Ad attribution analysis system.

    Groups raw touch points into per-user conversion journeys, runs every
    configured attribution model over them, and offers summary statistics,
    journey clustering and comparison plots.
    """

    def __init__(self, models: Dict[str, AttributionModel]):
        """Create the analyzer.

        Args:
            models: Attribution models to run, keyed by a display name.
        """
        self.models = models
        self.user_journeys = []
        logger.info("广告归因分析系统初始化")

    def add_touch_points(self, touch_points: List[TouchPoint]):
        """Group touch points by user and append one journey per converter.

        Users without a ``conversion`` event are ignored. Journeys are
        appended to ``self.user_journeys``; nothing is returned.

        Args:
            touch_points: Raw touch points, in any order.
        """
        # Group by user id.
        user_points = defaultdict(list)
        for point in touch_points:
            user_points[point.user_id].append(point)

        new_journeys = []
        for user_id, points in user_points.items():
            conversions = [p for p in points if p.event_type == 'conversion']
            if not conversions:
                continue

            # Assumes a single conversion per user: the first one in input
            # order is used.
            # NOTE(review): touch points stamped after the conversion are not
            # filtered out here - confirm whether callers guarantee ordering.
            conversion = conversions[0]
            other_points = [p for p in points if p.event_type != 'conversion']

            # Order the remaining touch points chronologically.
            sorted_points = sorted(other_points, key=lambda x: x.timestamp)

            # Path features.
            path_length = len(sorted_points)
            time_to_convert = (
                conversion.timestamp - sorted_points[0].timestamp
                if sorted_points else timedelta(0)
            )

            new_journeys.append(UserJourney(
                user_id=user_id,
                touch_points=sorted_points,
                conversion_value=conversion.conversion_value or 0,
                conversion_time=conversion.timestamp,
                path_length=path_length,
                time_to_convert=time_to_convert,
            ))

        self.user_journeys.extend(new_journeys)
        logger.info(f"添加 {len(new_journeys)} 条用户旅程，总计 {len(self.user_journeys)} 条")

    def run_attribution(self) -> Dict[str, Dict[str, float]]:
        """Run every configured model over the stored journeys.

        Returns:
            Mapping of model name to that model's attribution result.
        """
        results = {}

        for model_name, model in self.models.items():
            logger.info(f"运行归因模型: {model_name}")
            start_time = time.time()

            results[model_name] = model.attribute_conversions(self.user_journeys)

            elapsed = time.time() - start_time
            logger.info(f"完成 {model_name} 归因 - 耗时 {elapsed:.2f}秒")

        return results

    def analyze_journeys(self) -> Dict:
        """Summarise the stored journeys.

        Returns:
            An empty dict when there are no journeys. Otherwise a dict with
            ``num_journeys``, ``total_conversion_value``, ``avg_path_length``,
            ``avg_time_to_convert`` (hours), ``channel_participation``
            (journeys per channel) and ``journey_clusters`` (cluster label ->
            journey count; empty when there are five journeys or fewer).
        """
        if not self.user_journeys:
            return {}

        # Basic statistics.
        num_journeys = len(self.user_journeys)
        total_conversion_value = sum(j.conversion_value for j in self.user_journeys)
        avg_path_length = np.mean([j.path_length for j in self.user_journeys])
        # Seconds -> hours.
        avg_time_to_convert = np.mean(
            [j.time_to_convert.total_seconds() for j in self.user_journeys]
        ) / 3600

        # Channel participation: count each channel once per journey.
        channel_participation = defaultdict(int)
        for journey in self.user_journeys:
            for channel in {p.channel for p in journey.touch_points}:
                channel_participation[channel] += 1

        # Cluster path patterns only when there is enough data.
        cluster_info = {}
        journey_features = self._extract_journey_features()
        if len(journey_features) > 5:
            cluster_labels = self._cluster_journeys(journey_features)
            unique_clusters, cluster_counts = np.unique(cluster_labels, return_counts=True)
            # Plain ints keep the summary printable and JSON-serialisable.
            cluster_info = {
                int(label): int(count)
                for label, count in zip(unique_clusters, cluster_counts)
            }

        return {
            'num_journeys': num_journeys,
            'total_conversion_value': total_conversion_value,
            'avg_path_length': avg_path_length,
            'avg_time_to_convert': avg_time_to_convert,
            'channel_participation': dict(channel_participation),
            'journey_clusters': cluster_info
        }

    def _extract_journey_features(self) -> List[Dict]:
        """Build one numeric feature dict per stored journey.

        Each dict holds the path length, time to convert in seconds, the
        conversion value, and a 0/1 ``channel_<name>`` flag per known channel.
        """
        features = []
        # Sorted so the feature columns come out in a stable order.
        channels = sorted(self._get_all_channels())

        for journey in self.user_journeys:
            feat = {
                'path_length': journey.path_length,
                'time_to_convert': journey.time_to_convert.total_seconds(),
                'conversion_value': journey.conversion_value
            }

            journey_channels = {p.channel for p in journey.touch_points}
            for channel in channels:
                feat[f'channel_{channel}'] = 1 if channel in journey_channels else 0

            features.append(feat)

        return features

    def _get_all_channels(self) -> Set[str]:
        """Return every distinct channel that appears in the stored journeys."""
        return {
            point.channel
            for journey in self.user_journeys
            for point in journey.touch_points
        }

    def _cluster_journeys(self, features: List[Dict]) -> np.ndarray:
        """Cluster journeys with KMeans, choosing k by silhouette score.

        Args:
            features: Feature dicts as built by ``_extract_journey_features``.

        Returns:
            One cluster label per journey. When no candidate k yields at
            least two distinct clusters, every journey gets label 0.
        """
        df = pd.DataFrame(features)

        # Standardise so no single feature dominates the distance.
        scaled_features = StandardScaler().fit_transform(df)

        # silhouette_score needs at most n_samples - 1 clusters.
        max_clusters = min(10, len(features) - 1)
        best_score = -1
        best_k = None
        best_labels = None

        for k in range(2, max_clusters + 1):
            # n_init is pinned so results do not shift with the library's
            # changing default.
            kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
            labels = kmeans.fit_predict(scaled_features)

            # Duplicate points can collapse the fit to one cluster, for which
            # the silhouette score is undefined.
            if len(np.unique(labels)) < 2:
                continue

            score = silhouette_score(scaled_features, labels)
            if best_labels is None or score > best_score:
                best_score = score
                best_k = k
                best_labels = labels

        if best_labels is None:
            logger.warning("用户旅程聚类未找到有效划分，全部归入同一簇")
            return np.zeros(len(features), dtype=int)

        # The fit is deterministic (fixed random_state), so the labels from
        # the search loop are reused instead of refitting with best_k.
        logger.info(f"完成用户旅程聚类 - 最佳k值: {best_k}, 轮廓系数: {best_score:.2f}")
        return best_labels

    def visualize_attribution(self, attribution_results: Dict[str, Dict[str, float]]):
        """Plot attributed value per channel, absolute and as a percentage.

        Attribution keys are reduced to their channel part (text before the
        first ``:``) and summed per model and channel. Nothing is drawn when
        there are no attributed values.

        Args:
            attribution_results: Output of ``run_attribution``.
        """
        rows = [
            {
                'Model': model_name,
                'Channel': key.split(':')[0],  # keep only the channel name
                'Attributed Value': value
            }
            for model_name, attributions in (attribution_results or {}).items()
            for key, value in attributions.items()
        ]
        if not rows:
            # Every model returned an empty result; there is nothing to plot.
            return

        # Aggregate up front: one row per (model, channel). The plot then
        # needs no estimator or error-bar arguments, whose names differ
        # between seaborn releases.
        totals = (
            pd.DataFrame(rows)
            .groupby(['Model', 'Channel'], as_index=False, sort=False)['Attributed Value']
            .sum()
        )

        # Grouped bar chart of absolute attributed value.
        plt.figure(figsize=(12, 6))
        sns.barplot(
            data=totals,
            x='Channel',
            y='Attributed Value',
            hue='Model'
        )
        plt.title('各渠道归因价值对比')
        plt.ylabel('归因价值')
        plt.xlabel('广告渠道')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

        # Percentage share within each model. transform() keeps the original
        # row index, which avoids the duplicated group-key index level that
        # groupby().apply() produces on recent pandas.
        model_totals = totals.groupby('Model')['Attributed Value'].transform('sum')
        norm_df = totals.assign(
            **{'Attributed Value': 100 * totals['Attributed Value'] / model_totals}
        )

        plt.figure(figsize=(12, 6))
        sns.barplot(
            data=norm_df,
            x='Channel',
            y='Attributed Value',
            hue='Model'
        )
        plt.title('各渠道归因价值占比(%)')
        plt.ylabel('归因价值占比')
        plt.xlabel('广告渠道')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

# Example usage
if __name__ == "__main__":
    # 1. Initialise the attribution models.
    models = {
        'last_click': LastClickModel(),
        'linear': LinearModel(),
        'time_decay': TimeDecayModel(half_life=timedelta(hours=24)),
        'data_driven': DataDrivenModel(n_simulations=5000)
    }

    analyzer = AdAttributionAnalyzer(models)

    # 2. Generate mock data.
    def generate_mock_data(num_users: int = 1000) -> List[TouchPoint]:
        """Generate random touch points for ``num_users`` synthetic users.

        Each user gets 1-8 events spread over roughly the last eight days.
        About 30% of users convert; for them the chronologically last event
        is a ``conversion`` with a value between 10 and 100.
        """
        channels = ['google', 'facebook', 'instagram', 'twitter', 'email']
        campaigns = [f'campaign_{i}' for i in range(1, 6)]
        devices = ['mobile', 'desktop', 'tablet']
        locations = ['US', 'UK', 'CA', 'AU', 'DE']

        now = datetime.now()
        touch_points = []
        for _ in range(num_users):
            user_id = str(uuid.uuid4())

            # Decide whether this user converts (30% conversion rate).
            will_convert = random.random() < 0.3

            num_touch_points = random.randint(1, 8)

            # Draw all event times first and sort them oldest-first, so the
            # conversion (always the last generated event) is also the last
            # one in time. Independent random timestamps would scatter
            # touch points after the conversion.
            timestamps = sorted(
                now - timedelta(
                    days=random.uniform(0, 7),
                    hours=random.uniform(0, 24)
                )
                for _ in range(num_touch_points)
            )

            for i, timestamp in enumerate(timestamps):
                channel = random.choice(channels)
                campaign = random.choice(campaigns)
                device = random.choice(devices)
                location = random.choice(locations)

                # Pick the event type.
                if i == num_touch_points - 1 and will_convert:
                    event_type = 'conversion'
                    conversion_value = random.uniform(10, 100)
                else:
                    event_type = random.choices(
                        ['impression', 'click'],
                        weights=[0.7, 0.3],
                        k=1
                    )[0]
                    conversion_value = None

                # Only clicks and impressions carry a cost.
                cost = random.uniform(0.1, 2.0) if event_type != 'conversion' else 0

                touch_points.append(TouchPoint(
                    id=str(uuid.uuid4()),
                    user_id=user_id,
                    channel=channel,
                    campaign_id=campaign,
                    creative_id=f"creative_{random.randint(1, 10)}",
                    timestamp=timestamp,
                    cost=cost,
                    event_type=event_type,
                    conversion_value=conversion_value,
                    device_type=device,
                    location=location
                ))

        return touch_points

    logger.info("生成模拟数据...")
    mock_data = generate_mock_data(2000)
    analyzer.add_touch_points(mock_data)

    # 3. Analyse the user journeys.
    journey_analysis = analyzer.analyze_journeys()
    print("\n用户旅程分析结果:")
    print(f"总旅程数: {journey_analysis['num_journeys']}")
    print(f"总转化价值: ${journey_analysis['total_conversion_value']:.2f}")
    print(f"平均路径长度: {journey_analysis['avg_path_length']:.1f}")
    print(f"平均转化时间: {journey_analysis['avg_time_to_convert']:.1f} 小时")
    print("\n渠道参与度:")
    for channel, count in journey_analysis['channel_participation'].items():
        print(f"{channel}: {count} 次")

    # 4. Run the attribution models.
    logger.info("运行归因分析...")
    attribution_results = analyzer.run_attribution()

    print("\n归因分析结果:")
    for model_name, attributions in attribution_results.items():
        print(f"\n{model_name} 模型:")
        for channel, value in attributions.items():
            print(f"{channel}: ${value:.2f}")

    # 5. Visualise the results.
    analyzer.visualize_attribution(attribution_results)

使用说明

功能特点

多模型归因分析：
- 最后一次点击归因
- 线性归因
- 时间衰减归因
- 数据驱动(马尔可夫链)归因
用户旅程分析：
- 路径长度统计
- 转化时间分析
- 渠道参与度计算
- 旅程模式聚类
高级分析功能：
- 渠道移除效应计算
- 转化概率模拟
- 多维度特征提取
可视化支持：
- 归因价值对比图
- 渠道贡献百分比
- 旅程聚类分析（聚类统计由 analyze_journeys() 返回，当前代码未提供对应图表）

核心组件

TouchPoint：
- 广告触点数据模型
- 包含渠道、时间、成本等元数据
UserJourney：
- 用户转化路径模型
- 包含触点序列和转化信息
AttributionModel：
- 归因模型抽象基类
- 各具体模型实现
AdAttributionAnalyzer：
- 系统主控制器
- 数据管理与分析入口

使用方法

初始化系统：

models = {
    'last_click': LastClickModel(),
    'linear': LinearModel(),
    'time_decay': TimeDecayModel(),
    'data_driven': DataDrivenModel()
}
analyzer = AdAttributionAnalyzer(models)

导入数据：

# 从数据库或文件加载触点数据（load_touch_points 为示意用的占位函数，需自行实现并返回 List[TouchPoint]）
touch_points = load_touch_points()  
analyzer.add_touch_points(touch_points)

运行分析：

# 分析用户旅程特征
journey_analysis = analyzer.analyze_journeys()

# 运行归因模型
attribution_results = analyzer.run_attribution()

可视化结果：

analyzer.visualize_attribution(attribution_results)

获取洞察：

# 获取数据驱动归因模型的结果
data_driven_results = attribution_results['data_driven']

# 识别高价值渠道
top_channels = sorted(data_driven_results.items(), key=lambda x: -x[1])[:3]