拦截除了百度之外的蜘蛛和AI爬虫及IP完整清单

NGINX拦截除了百度之外的蜘蛛和爬虫

# Classify each request by its User-Agent header: $block_spider becomes 1 for
# crawlers that should be rejected and 0 for everything else.
# nginx evaluates the regex entries in the order they appear and stops at the
# first match, so the Baiduspider exemption must stay ahead of the generic
# "spider"/"bot" keywords below.
map $http_user_agent $block_spider {
    default 0;

    # Allow-list: Baidu's crawler is let through
    ~*Baiduspider 0;

    # Search-engine and SEO crawlers
    "~*(?:Googlebot|bingbot|360Spider|Sogou|YisouSpider|Bytespider|YandexBot|DuckDuckBot|AhrefsBot|SemrushBot|MJ12bot)" 1;

    # AI crawlers (2025 list)
    "~*(?:ChatGPT|ClaudeBot|Anthropic|OpenAI|Perplexity|DeepSeek|YouBot|GrokBot|Gemini|Atlas|Comet|AIbot)" 1;

    # Generic crawler keywords (catch-all for anything not named above)
    "~*(?:crawler|spider|bot)" 1;
}

server {
    # Reject blocked crawlers at server level: the check then runs before a
    # location is selected and covers every location in this server, instead
    # of only requests that end up in "location /".
    if ($block_spider) {
        return 403;
    }

    location / {
        # Normal request handling goes here
    }
}

下面整理了一份 AI 爬虫已知 IP 段列表：

AI 爬虫已知 IP 段列表（2025版）

爬虫名称	已知 IP 段	说明
OpenAI (ChatGPT, GPTBot)	`20.171.0.0/16`, `40.83.0.0/16`, `104.18.0.0/20`	覆盖 GPTBot 出口地址的宽泛网段，用于拦截 ChatGPT 数据采集；注意这并非官方公布的精确列表（其中 `104.18.0.0/20` 属于 Cloudflare 的地址空间），精确网段请以 OpenAI 官方公布的 gptbot.json 为准
Anthropic (ClaudeBot)	`34.160.0.0/16`, `34.149.0.0/16`	部署在 Google Cloud，美国地区为主
Perplexity AI	`34.80.0.0/16`, `34.81.0.0/16`, `34.82.0.0/16`	主要在 GCP（台湾/新加坡/美国）节点运行
Google Gemini / DeepMind	`66.249.64.0/19`, `64.233.160.0/19`	与 Googlebot 共用部分 IP 段，Gemini 爬虫也在其中
You.com (YouBot)	`3.64.0.0/16`, `3.65.0.0/16`	部署在 AWS 欧洲区
XAI (GrokBot)	`44.192.0.0/16`, `44.193.0.0/16`	部署在 AWS 美国东部
DeepSeek	`101.32.0.0/16`, `101.33.0.0/16`	中国大陆及香港节点，常见于腾讯云出口
其他通用 AI 爬虫	`52.167.0.0/16`, `52.168.0.0/16`	常见于微软 Azure 云，部分 AI 爬虫伪装使用

20.171.0.0/16
40.83.0.0/16
104.18.0.0/20
34.160.0.0/16
34.149.0.0/16
34.80.0.0/16
34.81.0.0/16
34.82.0.0/16
66.249.64.0/19
64.233.160.0/19
3.64.0.0/16
3.65.0.0/16
44.192.0.0/16
44.193.0.0/16
101.32.0.0/16
101.33.0.0/16
52.167.0.0/16
52.168.0.0/16

所以我们可以这样配置 NGINX：

http {
    # Map the client address to $block_ai: 1 when it falls inside one of the
    # listed crawler prefixes, 0 otherwise.
    # By default geo matches against the connecting client address; behind a
    # CDN or reverse proxy that is the proxy's address, not the visitor's.
    # NOTE(review): most entries are whole /16 cloud-provider prefixes, so
    # they may also match unrelated clients hosted in the same clouds.
    # Verify each range against the vendor's published list before deploying.
    geo $block_ai {
        default 0;

        # OpenAI (ChatGPT, GPTBot)
        20.171.0.0/16 1;
        40.83.0.0/16 1;
        104.18.0.0/20 1;

        # Anthropic (ClaudeBot)
        34.160.0.0/16 1;
        34.149.0.0/16 1;

        # Perplexity AI
        34.80.0.0/16 1;
        34.81.0.0/16 1;
        34.82.0.0/16 1;

        # Google Gemini / DeepMind
        66.249.64.0/19 1;
        64.233.160.0/19 1;

        # You.com (YouBot)
        3.64.0.0/16 1;
        3.65.0.0/16 1;

        # XAI (GrokBot)
        44.192.0.0/16 1;
        44.193.0.0/16 1;

        # DeepSeek
        101.32.0.0/16 1;
        101.33.0.0/16 1;

        # Other generic AI crawlers (Azure)
        52.167.0.0/16 1;
        52.168.0.0/16 1;
    }

    server {
        listen 80;
        server_name yourdomain.com;

        # Reject listed addresses at server level so the check applies to
        # every location in this server, not only to "location /".
        if ($block_ai) {
            return 403;
        }

        location / {
            # Normal request handling
            root /var/www/html;
            index index.html index.php;
        }
    }
}

或者全部整合一下：

http {
    # Map the client address to $block_ai: 1 when it falls inside one of the
    # listed crawler prefixes, 0 otherwise.
    # By default geo matches against the connecting client address; behind a
    # CDN or reverse proxy that is the proxy's address, not the visitor's.
    # NOTE(review): most entries are whole /16 cloud-provider prefixes, so
    # they may also match unrelated clients hosted in the same clouds.
    # Verify each range against the vendor's published list before deploying.
    geo $block_ai {
        default 0;

        # OpenAI (ChatGPT, GPTBot)
        20.171.0.0/16 1;
        40.83.0.0/16 1;
        104.18.0.0/20 1;

        # Anthropic (ClaudeBot)
        34.160.0.0/16 1;
        34.149.0.0/16 1;

        # Perplexity AI
        34.80.0.0/16 1;
        34.81.0.0/16 1;
        34.82.0.0/16 1;

        # Google Gemini / DeepMind
        66.249.64.0/19 1;
        64.233.160.0/19 1;

        # You.com (YouBot)
        3.64.0.0/16 1;
        3.65.0.0/16 1;

        # XAI (GrokBot)
        44.192.0.0/16 1;
        44.193.0.0/16 1;

        # DeepSeek
        101.32.0.0/16 1;
        101.33.0.0/16 1;

        # Other generic AI crawlers (Azure)
        52.167.0.0/16 1;
        52.168.0.0/16 1;
    }

    # Classify each request by its User-Agent header: $block_spider is 1 for
    # crawlers to reject and 0 otherwise. Regex entries are evaluated in the
    # order written and the first match wins, so the Baiduspider exemption
    # must stay ahead of the generic "spider"/"bot" keywords.
    map $http_user_agent $block_spider {
        default 0;
        ~*Baiduspider 0;       # allow Baidu's crawler

        # Search-engine and SEO crawlers
        ~*Googlebot 1;
        ~*bingbot 1;
        ~*360Spider 1;
        ~*Sogou 1;
        ~*YisouSpider 1;
        ~*Bytespider 1;
        ~*YandexBot 1;
        ~*DuckDuckBot 1;
        ~*AhrefsBot 1;
        ~*SemrushBot 1;
        ~*MJ12bot 1;

        # AI crawlers (2025 list)
        ~*ChatGPT 1;
        ~*ClaudeBot 1;
        ~*Anthropic 1;
        ~*OpenAI 1;
        ~*Perplexity 1;
        ~*DeepSeek 1;
        ~*YouBot 1;
        ~*GrokBot 1;
        ~*Gemini 1;
        ~*Atlas 1;
        ~*Comet 1;
        ~*AIbot 1;

        # Generic crawler keywords
        ~*crawler 1;
        ~*spider 1;
        ~*bot 1;
    }

    server {
        listen 80;
        server_name yourdomain.com;

        # Two-layer protection (IP prefix + User-Agent), enforced at server
        # level so it applies to every location in this server, not only to
        # "location /".
        if ($block_ai) {
            return 403;
        }
        if ($block_spider) {
            return 403;
        }

        location / {
            # Normal request handling
            root /var/www/html;
            index index.html index.php;
        }
    }
}