本篇文章更新時間:2019/02/16
如有資訊過時或語誤之處,歡迎使用 Contact 功能通知。
一介資男的 LINE 社群開站囉!歡迎入群聊聊~
如果本站內容對你有幫助,歡迎使用 BFX Pay 加密貨幣贊助支持。
這個題目在寫爬蟲的時候算滿常見會要解決的問題。
正好這次因為要爬的對象把資料改放到 Google 雲端硬碟上,所以就來紀錄一下這段程式吧!
是說指令版的 `curl` 實在是超好物,一跨到程式上就麻煩不少了XD
<?php
//Ref: http://php.net/manual/en/ref.curl.php#93163
/**
 * Resolve a URL to its final location and download the file it points to.
 *
 * A probe request follows redirects (sharing one cookie jar) to find the
 * effective URL and the response headers. The file name is taken from the
 * Content-Disposition header when present, otherwise from the URL path, and
 * the file is then downloaded into the current working directory.
 *
 * JavaScript or <meta> based redirects inside the page body are not followed.
 *
 * @param string $url     URL to resolve. HTML-escaped "&amp;" separators are accepted.
 * @param string $cookie  Path of a cookie jar file. When empty, a temporary jar is
 *                        created for this call and removed before returning.
 * @param int    $timeout Connect and transfer timeout (seconds) of the probe request;
 *                        also the connect timeout of the download.
 *
 * @return array array("filename" => name of the local file written, "url" => final URL)
 *
 * @throws RuntimeException When no response is received, the server answers with an
 *                          error status, redirects never settle, or the file cannot
 *                          be written.
 */
function get_final_url($url, $cookie = "", $timeout = 5) {
    // NOTE(review): urldecode() on a whole URL also decodes reserved characters
    // such as %26; kept because existing callers may rely on it.
    $url = str_replace("&amp;", "&", urldecode(trim($url)));

    // Pretend to be a regular browser.
    $user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36";

    // Keep cookies between requests; remember whether the jar is ours to clean up.
    $owns_cookie = false;
    if ((string) $cookie === "") {
        $jar = tempnam(sys_get_temp_dir(), "mxp_");
        if ($jar !== false) {
            $cookie = $jar;
            $owns_cookie = true;
        }
    }

    try {
        $redirect_codes = array(301, 302, 303, 307, 308);
        $max_rounds = 5;
        $response = array();
        $headers = array();
        $settled = false;

        for ($round = 0; $round < $max_rounds; $round++) {
            list($response, $headers, $error) = _get_final_url_probe($url, $cookie, $timeout, $user_agent);

            if (count($headers) === 0) {
                throw new RuntimeException("No response from " . $url . ($error !== "" ? ": " . $error : ""));
            }

            // cURL already follows redirects; this only triggers when it stopped on
            // a 3xx answer (for example after hitting its own redirect limit).
            $location = _get_final_url_header($headers, "location");
            if (!in_array((int) $response['http_code'], $redirect_codes, true) || $location === "") {
                $settled = true;
                break;
            }

            // The Location value is used verbatim.
            // NOTE(review): a relative Location is not resolved against the base URL.
            $url = $location;
        }

        if (!$settled) {
            throw new RuntimeException("Too many redirects while resolving " . $url);
        }
        if ((int) $response['http_code'] >= 400) {
            throw new RuntimeException("HTTP " . $response['http_code'] . " received from " . $response['url']);
        }

        $final_url = $response['url'];
        $filename = _get_final_url_filename(_get_final_url_header($headers, "content-disposition"), $final_url);

        // Download the file behind the final URL.
        _get_final_url_download($final_url, $filename, $cookie, $timeout, $user_agent);

        return array("filename" => $filename, "url" => $final_url);
    } finally {
        if ($owns_cookie && is_file($cookie)) {
            unlink($cookie);
        }
    }
}

/**
 * Probe a URL with cURL, following redirects, and collect the last response's headers.
 *
 * A failed transfer (e.g. a timeout on a large body) is not fatal here: the
 * effective URL and the headers received so far are still returned.
 *
 * @return array array(curl_getinfo() result, list of header lines, cURL error message)
 */
function _get_final_url_probe($url, $cookie, $timeout, $user_agent) {
    $headers = array();

    $ch = curl_init();
    if ($ch === false) {
        throw new RuntimeException("Unable to initialise cURL");
    }
    curl_setopt($ch, CURLOPT_USERAGENT, $user_agent);
    curl_setopt($ch, CURLOPT_URL, $url);
    // COOKIEFILE makes cURL send stored cookies; COOKIEJAR saves them on close.
    curl_setopt($ch, CURLOPT_COOKIEFILE, (string) $cookie);
    if ((string) $cookie !== "") {
        curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie);
    }
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_ENCODING, "");
    // Keep the body out of the output stream; only the headers are needed here.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_AUTOREFERER, true);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_MAXREDIRS, 10);
    curl_setopt($ch, CURLOPT_HEADERFUNCTION, function ($handle, $line) use (&$headers) {
        // A status line starts a new response in the redirect chain: keep only the last one.
        if (strncasecmp($line, "HTTP/", 5) === 0) {
            $headers = array();
        }
        $trimmed = trim($line);
        if ($trimmed !== "") {
            $headers[] = $trimmed;
        }
        // cURL requires the number of bytes handled to be returned.
        return strlen($line);
    });

    curl_exec($ch);
    $info = curl_getinfo($ch);
    $error = curl_error($ch);
    curl_close($ch);

    return array($info, $headers, $error);
}

/**
 * Return the trimmed value of the last header line named $name (case-insensitive), or "".
 */
function _get_final_url_header($headers, $name) {
    $prefix = strtolower($name) . ":";
    $length = strlen($prefix);
    $found = "";
    foreach ($headers as $line) {
        if (strncasecmp($line, $prefix, $length) === 0) {
            $found = trim(substr($line, $length));
        }
    }
    return $found;
}

/**
 * Derive a safe local file name from a Content-Disposition value, falling back to the URL path.
 */
function _get_final_url_filename($disposition, $url) {
    $name = "";
    if (preg_match("/filename\\*\\s*=\\s*[^';]*'[^';]*'([^;]+)/i", $disposition, $match)) {
        // Extended form: filename*=charset'lang'percent-encoded-name
        $name = rawurldecode(trim($match[1]));
    } elseif (preg_match('/filename\s*=\s*"([^"]*)"/i', $disposition, $match)) {
        $name = trim($match[1]);
    } elseif (preg_match('/filename\s*=\s*([^;]+)/i', $disposition, $match)) {
        $name = trim($match[1]);
    }

    if ($name === "") {
        $path = parse_url($url, PHP_URL_PATH);
        $name = is_string($path) ? rawurldecode($path) : "";
    }

    // The name comes from the remote server: drop any directory part so it
    // cannot escape the current directory.
    $name = basename(str_replace(array("\\", "\0"), array("/", ""), $name));
    if ($name === "" || $name === "." || $name === "..") {
        $name = "download";
    }
    return $name;
}

/**
 * Stream the resource at $url into the local file $filename.
 *
 * @throws RuntimeException When the file cannot be opened or the transfer fails;
 *                          a partially written file is removed.
 */
function _get_final_url_download($url, $filename, $cookie, $timeout, $user_agent) {
    $fp = fopen($filename, 'w');
    if ($fp === false) {
        throw new RuntimeException("Unable to open " . $filename . " for writing");
    }

    $ch = curl_init($url);
    if ($ch === false) {
        fclose($fp);
        unlink($filename);
        throw new RuntimeException("Unable to initialise cURL");
    }
    curl_setopt($ch, CURLOPT_USERAGENT, $user_agent);
    curl_setopt($ch, CURLOPT_COOKIEFILE, (string) $cookie);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_MAXREDIRS, 10);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    // Treat HTTP error statuses as failures instead of saving the error page.
    curl_setopt($ch, CURLOPT_FAILONERROR, true);
    curl_setopt($ch, CURLOPT_FILE, $fp);

    $ok = curl_exec($ch);
    $error = curl_error($ch);
    curl_close($ch);
    fclose($fp);

    if ($ok === false) {
        unlink($filename);
        throw new RuntimeException("Download of " . $url . " failed: " . $error);
    }
}
// Example run: resolve a Google Drive download link and dump the returned array.
$result = get_final_url('https://drive.google.com/uc?export=download&id=0B21MNCxxxxxxxxxxxxxxxxxxx1xdVk');
print_r($result);
Gist: Link
範例既然是針對 Google 雲端硬碟,就在最後解析出最終檔案位置時一併把他下載回來。