<?php
// sitemap-generator-PHP-FULL.php - Full crawler with brute-force path probing and no page limit

// --- Runtime configuration ---
error_reporting(E_ALL);
// The response body IS the sitemap and is also captured into sitemap.xml, so
// diagnostics must go to the error log: a displayed warning would be written
// into the XML and make it invalid.
ini_set('display_errors', '0');
ini_set('log_errors', '1');
ini_set('memory_limit','2048M');
set_time_limit(0);

// --- Start URL (untrusted: comes straight from the query string) ---
$startUrl = $_GET['start_url'] ?? 'https://example.com';
$startParts = is_string($startUrl) ? parse_url(trim($startUrl)) : false;
$startScheme = (is_array($startParts) && isset($startParts['scheme']))
    ? strtolower($startParts['scheme'])
    : '';
if (
    !is_array($startParts)
    || empty($startParts['host'])
    || !in_array($startScheme, ['http', 'https'], true)
) {
    // Reject arrays, relative URLs and non-web schemes (file://, gopher://, ...)
    // before anything is handed to cURL.
    http_response_code(400);
    header('Content-Type: text/plain; charset=utf-8');
    exit("Invalid start_url: an absolute http(s) URL is required.\n");
}
$startUrl = trim($startUrl);

// Host used to decide whether a discovered link belongs to the crawled site.
$baseOrigin = $startParts['host'];
// Site root used for the brute-force probes; keeps the scheme and port of the
// start URL instead of assuming https on the default port.
$baseUrl = $startScheme . '://' . $baseOrigin
    . (isset($startParts['port']) ? ':' . $startParts['port'] : '');

// --- Start of the XML response (buffered so it can also be saved to disk) ---
header('Content-Type: application/xml; charset=utf-8');
ob_start();
echo '<?xml version="1.0" encoding="UTF-8"?>'."\n";
echo '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'."\n";

// --- User agents for rotation ---
// Pool of browser identification strings sent with outgoing requests.
$userAgents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/123.0 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1 Safari/604.1',
];

// --- Brute-force word list for orphan pages ---
// Paths probed directly under the site root to find pages nothing links to.
$bruteForceList = [
    'sitemap.xml',
    'sitemap_index.xml',
    'index.html',
    'home',
    'blog',
    'news',
    'products',
    'category',
    'page',
    'search',
    'about',
    'contact',
    'faq',
    'shop',
    'cart',
    'login',
    'admin',
    'dashboard',
];

// --- Utility functions ---
/**
 * Reduce a URL to the canonical form used as a de-duplication key.
 *
 * The fragment ("#...") is dropped because it never reaches the server, so
 * "page#a" and "page#b" are the same document. Trailing slashes are then
 * removed so "/dir/" and "/dir" share one key.
 *
 * @param string $url URL to normalise.
 * @return string The URL without fragment and without trailing slashes.
 */
function normalize_url($url){
    // Keep only what precedes the first '#'.
    $withoutFragment = explode('#', (string)$url, 2)[0];
    return rtrim($withoutFragment,'/');
}
/**
 * Resolve a link found in a document against the URL of that document,
 * following the reference-resolution rules of RFC 3986 section 5.2.
 *
 * - An href that already has a scheme is returned unchanged.
 * - "//host/path" takes the scheme of $base.
 * - "/path" is resolved against the origin of $base, not against its path.
 * - "file.html" and "../x" are resolved against the directory of $base,
 *   with "." and ".." segments collapsed.
 * - "?query" and "#fragment" are applied to the path of $base.
 *
 * @param string      $base Absolute URL of the document containing the link.
 * @param string|null $href Raw href attribute value.
 * @return string|null Absolute URL, or null when $href is empty.
 */
function resolve_url($base,$href){
    $href = trim((string)$href);
    if($href === '') return null;

    // Already absolute (http:, https:, but also tel:, ftp:, ...): leave as is.
    if(parse_url($href, PHP_URL_SCHEME)) return $href;

    $parts = parse_url((string)$base);
    if(!is_array($parts) || empty($parts['host'])){
        // $base is not an absolute URL, so there is no origin to resolve
        // against; fall back to plain concatenation.
        if(strpos($href,'//')===0) return 'https:'.$href;
        return rtrim((string)$base,'/').'/'.ltrim($href,'/');
    }

    $scheme = $parts['scheme'] ?? 'https';

    // Protocol-relative reference: inherit the scheme of the base document.
    if(strpos($href,'//')===0) return $scheme.':'.$href;

    // Rebuild "scheme://[user[:pass]@]host[:port]".
    $authority = $parts['host'];
    if(isset($parts['port'])) $authority .= ':'.$parts['port'];
    if(isset($parts['user'])){
        $credentials = $parts['user'].(isset($parts['pass']) ? ':'.$parts['pass'] : '');
        $authority = $credentials.'@'.$authority;
    }
    $origin = $scheme.'://'.$authority;
    $basePath = (isset($parts['path']) && $parts['path'] !== '') ? $parts['path'] : '/';

    // Fragment-only reference: same document, same query.
    if($href[0] === '#'){
        $baseQuery = isset($parts['query']) ? '?'.$parts['query'] : '';
        return $origin.$basePath.$baseQuery.$href;
    }
    // Query-only reference: same path, new query.
    if($href[0] === '?') return $origin.$basePath.$href;

    // Split the reference into its path and the trailing "?query#fragment".
    $cut = strcspn($href,'?#');
    $path = (string)substr($href,0,$cut);
    $suffix = (string)substr($href,$cut);

    if($path[0] !== '/'){
        // Relative path: merge with the directory part of the base path.
        $path = substr($basePath,0,strrpos($basePath,'/')+1).$path;
    }

    // Collapse "." and ".." segments; ".." never climbs above the root.
    $segments = explode('/', (string)substr($path,1));
    $resolved = [];
    foreach($segments as $segment){
        if($segment === '.') continue;
        if($segment === '..'){
            array_pop($resolved);
            continue;
        }
        $resolved[] = $segment;
    }
    // A path ending in "." or ".." designates a directory: keep the slash.
    $last = end($segments);
    if($last === '.' || $last === '..') $resolved[] = '';

    return $origin.'/'.implode('/',$resolved).$suffix;
}

// --- Main crawl ---
// Breadth-first traversal of every <a href> that stays on the host of the
// start URL. $visited de-duplicates pages by normalised URL, $urls collects
// the sitemap entries, and $queue holds the pages still to be fetched.
$seedUrl = normalize_url($startUrl);
$visited = [$seedUrl => true];
$urls = [$seedUrl => ['lastmod'=>date('Y-m-d')]];

// SplQueue gives O(1) dequeue; array_shift() re-indexes the whole array on
// every pop, which becomes quadratic on large sites.
$queue = new SplQueue();
$queue->enqueue($startUrl);

// Collect HTML parse errors internally instead of emitting PHP warnings.
libxml_use_internal_errors(true);

// One handle for the whole crawl so connections to the site can be reused.
$ch = curl_init();
curl_setopt_array($ch, [
    CURLOPT_RETURNTRANSFER=>true,
    CURLOPT_FOLLOWLOCATION=>true,
    CURLOPT_TIMEOUT=>15,
    // NOTE(review): certificate verification is disabled, as in the original
    // code, so sites with broken TLS can still be crawled; responses are
    // therefore not authenticated.
    CURLOPT_SSL_VERIFYPEER=>false,
    // Never let the start URL or a redirect reach file://, gopher://, etc.
    CURLOPT_PROTOCOLS=>CURLPROTO_HTTP | CURLPROTO_HTTPS,
    CURLOPT_REDIR_PROTOCOLS=>CURLPROTO_HTTP | CURLPROTO_HTTPS,
]);

while(!$queue->isEmpty()){
    $url = $queue->dequeue();

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_USERAGENT, $userAgents[array_rand($userAgents)]);
    $html = curl_exec($ch);

    if(!is_string($html) || $html === '') continue;

    // Relative links must be resolved against the URL the document was
    // really served from (after redirects), provided it is still on the
    // crawled host; otherwise keep the requested URL as the base.
    $pageUrl = $url;
    $effectiveUrl = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
    if(is_string($effectiveUrl) && $effectiveUrl !== ''){
        $effectiveHost = parse_url($effectiveUrl, PHP_URL_HOST);
        if(is_string($effectiveHost) && is_string($baseOrigin)
            && strcasecmp($effectiveHost, $baseOrigin) === 0){
            $pageUrl = $effectiveUrl;
        }
    }

    // --- Extract every <a href> link ---
    $dom = new DOMDocument();
    $dom->loadHTML($html);
    // Discard the parse errors collected for this page; without this they
    // pile up in memory for the whole duration of the crawl.
    libxml_clear_errors();
    $xpath = new DOMXPath($dom);
    $links = $xpath->query('//a[@href]');

    foreach($links as $link){
        $href = trim($link->getAttribute('href'));
        // Empty links and same-page anchors lead nowhere new.
        if($href === '' || $href[0] === '#') continue;
        if(stripos($href,'javascript:')===0 || stripos($href,'mailto:')===0) continue;

        $full = resolve_url($pageUrl,$href);
        if(!is_string($full)) continue;
        // The fragment is client-side only; drop it before de-duplicating.
        $full = explode('#', $full, 2)[0];

        $target = parse_url($full);
        if(!is_array($target) || !isset($target['scheme'], $target['host'])) continue;
        // Only web pages belong in a sitemap (skips ftp:, tel:, ...).
        if(!in_array(strtolower($target['scheme']), ['http','https'], true)) continue;
        // Stay on the crawled host; host names are case-insensitive.
        if(!is_string($baseOrigin) || strcasecmp($target['host'], $baseOrigin) !== 0) continue;

        $norm = normalize_url($full);
        if(!isset($visited[$norm])){
            $visited[$norm] = true;
            $urls[$norm] = ['lastmod'=>date('Y-m-d')];
            $queue->enqueue($norm);
        }
    }
}
curl_close($ch);

// --- Brute-force probe for orphan pages ---
// Each word-list entry is requested directly under the site root with a
// body-less request; paths answering 200, 301 or 302 are added to the sitemap.
foreach($bruteForceList as $candidatePath){
    $candidateUrl = normalize_url($baseUrl.'/'.$candidatePath);

    // Already found by the crawl (or by an earlier probe): nothing to do.
    if(isset($visited[$candidateUrl])){
        continue;
    }

    $agent = $userAgents[array_rand($userAgents)];

    $probe = curl_init();
    curl_setopt($probe, CURLOPT_URL, $candidateUrl);
    curl_setopt($probe, CURLOPT_NOBODY, true);          // status only, no body
    curl_setopt($probe, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($probe, CURLOPT_TIMEOUT, 5);
    curl_setopt($probe, CURLOPT_USERAGENT, $agent);
    // NOTE(review): certificate verification is disabled here.
    curl_setopt($probe, CURLOPT_SSL_VERIFYPEER, false);
    curl_exec($probe);
    $status = curl_getinfo($probe, CURLINFO_HTTP_CODE);
    curl_close($probe);

    $exists = ($status == 200 || $status == 301 || $status == 302);
    if(!$exists){
        continue;
    }

    $visited[$candidateUrl] = true;
    $urls[$candidateUrl] = ['lastmod'=>date('Y-m-d')];
}

// --- Generate the sitemap XML ---
// One <url> element per collected URL, appended to the output buffer that
// already holds the XML declaration and the opening <urlset> tag.
$xmlEscape = ENT_QUOTES | ENT_SUBSTITUTE | ENT_XML1;
foreach($urls as $u=>$data){
    // ENT_SUBSTITUTE matters: without it htmlspecialchars() returns an empty
    // string for a URL containing invalid UTF-8, producing an empty <loc>.
    $loc = htmlspecialchars((string)$u, $xmlEscape, 'UTF-8');
    $lastmod = htmlspecialchars((string)$data['lastmod'], $xmlEscape, 'UTF-8');
    echo "<url>\n";
    echo "  <loc>{$loc}</loc>\n";
    echo "  <lastmod>{$lastmod}</lastmod>\n";
    echo "</url>\n";
}

echo '</urlset>';

// --- Save sitemap.xml ---
// The buffered response doubles as the file content. A failed write is logged
// rather than ignored, and LOCK_EX keeps two concurrent runs from
// interleaving their output in the file.
$sitemapXml = ob_get_contents();
if($sitemapXml === false){
    error_log('sitemap generator: no active output buffer, sitemap.xml not written');
}elseif(file_put_contents('sitemap.xml', $sitemapXml, LOCK_EX) === false){
    error_log('sitemap generator: unable to write sitemap.xml');
}
if(ob_get_level() > 0){
    ob_end_flush();
}
?>
