1. 隨便找個公衆號地址我們來抓一下代碼
(隨便找的)
2.我們正常的抓
/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes it into an HTML string.
/// </summary>
/// <remarks>
/// The bytes are first decoded as GB2312 so the charset declared in the markup can be read
/// with <c>GetCharset</c>. When the page declares UTF-8 or Unicode, the same bytes are decoded
/// again with that encoding; any other (or missing) declaration keeps the GB2312 text.
/// </remarks>
/// <param name="url">Address of the page to download.</param>
/// <returns>The decoded HTML, or <c>null</c> when the download or decoding fails.</returns>
public static string GetHtmlByUrl(string url)
{
    using (WebClient wc = new WebClient())
    {
        try
        {
            wc.UseDefaultCredentials = true;
            wc.Proxy = new WebProxy();
            wc.Proxy.Credentials = CredentialCache.DefaultCredentials;
            wc.Credentials = System.Net.CredentialCache.DefaultCredentials;

            byte[] bt = wc.DownloadData(url);

            // Provisional decode: only needed to locate the charset declaration in the markup.
            string txt = System.Text.Encoding.GetEncoding("GB2312").GetString(bt);

            // Compare ordinally and case-insensitively; a culture-sensitive ToUpper() would
            // turn "unicode" into "UNİCODE" under a Turkish culture and miss the match.
            string charset = (GetCharset(txt) ?? "").Trim();
            if (charset.Equals("UTF-8", StringComparison.OrdinalIgnoreCase))
            {
                txt = System.Text.Encoding.UTF8.GetString(bt);
            }
            else if (charset.Equals("UNICODE", StringComparison.OrdinalIgnoreCase))
            {
                txt = System.Text.Encoding.Unicode.GetString(bt);
            }

            return txt;
        }
        catch (Exception)
        {
            // Deliberate best-effort: callers treat null as "could not fetch the page".
            return null;
        }
    }
}
/// <summary>
/// Extracts the charset name declared in an HTML document.
/// </summary>
/// <remarks>
/// Two declaration forms are tried in order: a <c>charset=</c> that follows a
/// <c>content="..."</c> attribute (the http-equiv form), then <c>&lt;meta charset=...&gt;</c>.
/// </remarks>
/// <param name="html">HTML text to inspect; <c>null</c> or empty yields an empty result.</param>
/// <returns>The declared charset exactly as written in the markup, or an empty string when none is found.</returns>
public static string GetCharset(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return "";
    }

    // The lazy ".*?" takes the FIRST charset after a content attribute; a greedy ".*" would
    // run to the end of the line and pick up an unrelated later one (e.g. <script charset="gbk">).
    // The capture stops at quotes, whitespace, ';', '/' and '>' so an unquoted value does not
    // swallow the markup that follows it.
    Match match = Regex.Match(
        html,
        @"content=[""'].*?\s*charset\b\s*=\s*""?(?<charset>[^""'\s;/>]*)",
        RegexOptions.IgnoreCase);
    string charset = match.Success ? match.Groups["charset"].Value : "";

    if (charset.Equals(""))
    {
        match = Regex.Match(
            html,
            @"<\s*meta\s*charset\s*=\s*[""']?(?<charset>[^""'\s;/>]*)",
            RegexOptions.IgnoreCase);
        if (match.Success)
        {
            charset = match.Groups["charset"].Value;
        }
    }

    return charset;
}
3. 我們將爬出來的網頁保存一哈瞅瞅看
/// <summary>
/// Writes the given HTML, followed by a line terminator, to the file "MyHtml.html"
/// using UTF-8, overwriting any existing file.
/// </summary>
/// <param name="html">Markup to persist.</param>
/// <returns><c>true</c> when the file was written; <c>false</c> when any exception occurred.</returns>
public static bool SaveHtml(string html)
{
    bool saved;
    try
    {
        // append: false => an existing file is replaced rather than extended.
        using (StreamWriter writer = new StreamWriter("MyHtml.html", false, Encoding.UTF8))
        {
            writer.WriteLine(html);
        }
        saved = true;
    }
    catch (Exception)
    {
        // Failures are reported through the return value only.
        saved = false;
    }
    return saved;
}
這樣調用
// Example article address used throughout the walkthrough.
string url = "https://mp.weixin.qq.com/s?__biz=MjM5MDk4NTg2MA==&mid=2652650152&idx=1&sn=93e4aeb524ee94cb64d1e9dbd5cf0266&chksm=bd54e1438a236855e93c27f0565b91380277ab484f8deda25745986b295b58046d75e52a68e2&scene=21#wechat_redirect";
// Fetch the page source (null when the download fails).
string backHtml = GetHtmlByUrl(url);
// Persist the fetched markup to disk.
bool saveResult = SaveHtml(backHtml);
這是原網頁
這是我們爬到本地的網頁
此時我們F12打開調試可以看到,很多錯誤信息
我們可以看到,js用的是本地的地址,我們這裏需要把原來Html的代碼給改了(location.protocol,location.href,location.host) 採用絕對路徑
4.我們採用第三方的 AngleSharp 來改Html代碼 (以前都寫正則,現在忘了怎麼寫了。。),直接Nuget安裝就行了
/// <summary>
/// Rewrites a downloaded page so that its scripts keep resolving against the original site
/// when the page is opened from another location.
/// </summary>
/// <remarks>
/// A script defining <c>newhref</c>, <c>newhost</c> and <c>newprotocol</c> (taken from
/// <paramref name="url"/>) is injected into the head. The serialized markup is then patched
/// textually: <c>"//res</c> becomes <c>"https://res</c>, and <c>location.protocol</c>,
/// <c>location.href</c> and <c>location.host</c> are replaced by the injected variables.
/// </remarks>
/// <param name="html">Source markup of the downloaded page.</param>
/// <param name="url">Address the page was requested from; must be an absolute URI.</param>
/// <returns>The rewritten document markup.</returns>
public static string HandleHtml(string html, string url)
{
    var parser = new HtmlParser();
    var document = parser.ParseDocument(html);
    Uri myurl = new Uri(url);

    // Inject the replacement variables. Values are JavaScript-string-encoded so a quote,
    // backslash or "</script>" inside the URL cannot break out of the literal.
    var addNode = document.CreateElement("script");
    addNode.TextContent =
        "var newhref=\"" + System.Web.HttpUtility.JavaScriptStringEncode(url) + "\";" +
        "var newhost=\"" + System.Web.HttpUtility.JavaScriptStringEncode(myurl.Host) + "\";" +
        "var newprotocol=\"" + System.Web.HttpUtility.JavaScriptStringEncode(myurl.Scheme) + ":\";";
    // Prepend (not append): head scripts that get rewritten to use these variables must
    // find them already defined when they execute.
    document.Head.Prepend(addNode);

    var newOuterHtml = document.DocumentElement.OuterHtml;

    // Global textual replacements: protocol-relative "//res" URLs and location.* accesses.
    return newOuterHtml
        .Replace("\"//res", "\"https://res")
        .Replace("location.protocol", "newprotocol")
        .Replace("location.href", "newhref")
        .Replace("location.host", "newhost");
}
調用方式我們也改一下
// Only post-process and save when the download actually produced content.
if (!string.IsNullOrWhiteSpace(backHtml))
{
    // Patch the markup first, then persist the patched version.
    string newHtml = HandleHtml(backHtml, url);
    bool saveResult = SaveHtml(newHtml);
}
然後我們再看看什麼效果
圖片出來了,視頻沒有出來。。。
繼續打開調試,發現是有些js沒有加載,
還是用的本地的路徑,因爲html引用的js還是用的網絡地址,除非把js下載下來;於是,我們把爬下來的網頁掛在IIS下試試
一個是圖片未授權,一個是圖片跨域 。。
未授權好辦,直接在html裏加一段代碼;跨域我想到的辦法是用自己的服務器替換一次(就是說圖片地址改爲我們服務器的地址)。直接上處理過的代碼
/// <summary>
/// Downloads the image at <paramref name="imgUrl"/> on the server and returns its bytes,
/// so the browser loads the image from this API instead of the original host.
/// </summary>
/// <param name="imgUrl">Absolute http/https address of the image, taken from the query string.</param>
/// <returns>
/// 200 with the image bytes, or 400 when <paramref name="imgUrl"/> is missing or is not an
/// absolute http/https URL.
/// </returns>
[HttpGet]
[Route("GetImgStream")]
public HttpResponseMessage GetImgStream([FromUri]string imgUrl)
{
    // imgUrl is caller-controlled: reject anything that is not an absolute http(s) URL
    // (e.g. file:// paths) before handing it to the downloader.
    // NOTE(review): any http(s) host is still accepted, which makes this endpoint an open
    // proxy; consider restricting to the expected image host(s).
    Uri target;
    if (!Uri.TryCreate(imgUrl, UriKind.Absolute, out target)
        || (target.Scheme != Uri.UriSchemeHttp && target.Scheme != Uri.UriSchemeHttps))
    {
        return new HttpResponseMessage(HttpStatusCode.BadRequest);
    }

    var imageBuffer = GetDownloadStream(imgUrl);
    var respimg = new HttpResponseMessage(HttpStatusCode.OK)
    {
        Content = new System.Net.Http.ByteArrayContent(imageBuffer)
    };
    // "image/jpeg" is the registered media type; "image/jpg" is not.
    respimg.Content.Headers.ContentType = new MediaTypeHeaderValue("image/jpeg");
    return respimg;
}
/// <summary>
/// Downloads a remote file over HTTP(S) and returns its content as a byte array.
/// </summary>
/// <param name="url">Complete address of the file to download.</param>
/// <returns>The full response body.</returns>
/// <exception cref="ArgumentException"><paramref name="url"/> does not produce an HTTP request.</exception>
/// <exception cref="WebException">The request failed.</exception>
public byte[] GetDownloadStream(string url)
{
    HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
    if (request == null)
    {
        // WebRequest.Create returns other request types for non-HTTP schemes.
        throw new ArgumentException("Only http/https URLs are supported.", nameof(url));
    }
    request.Method = "GET";

    // Send the request; the using blocks release the connection and streams even when
    // reading fails. Exceptions propagate untouched so their stack traces are preserved.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream responseStream = response.GetResponseStream())
    using (MemoryStream buffer = new MemoryStream())
    {
        // Buffered copy instead of one ReadByte/WriteByte call per byte.
        responseStream.CopyTo(buffer);
        return buffer.ToArray();
    }
}
我們寫一個webapi,然後把html所有的圖片地址都換成我們的api地址動態獲取圖片,於是我們的代碼改成了這樣
/// <summary>
/// Rewrites a downloaded page so that it renders when served from another host.
/// </summary>
/// <remarks>
/// Steps, in order: add a <c>referrer</c> meta tag; inject a script defining
/// <c>newhref</c>/<c>newhost</c>/<c>newprotocol</c> from <paramref name="url"/>; make
/// protocol-relative <c>link</c> hrefs absolute (https); replace video iframes that carry a
/// <c>data-mpvid</c> attribute with a <c>video</c> element pointing at the address returned by
/// <c>GetRealVideo</c>; finally apply textual replacements on the serialized markup
/// (<c>"//res</c>, <c>location.*</c>, image sources routed through the image API, and the
/// relative video player path).
/// </remarks>
/// <param name="html">Source markup of the downloaded page.</param>
/// <param name="url">Address the page was requested from; must be an absolute URI.</param>
/// <returns>The rewritten document markup.</returns>
public static string HandleHtml(string html, string url)
{
    var parser = new HtmlParser();
    var document = parser.ParseDocument(html);
    Uri myurl = new Uri(url);

    // Referrer meta tag added to the head.
    var addMetaDom = document.CreateElement("meta");
    addMetaDom.SetAttribute("name", "referrer");
    addMetaDom.SetAttribute("content", "never");
    document.Head.Append(addMetaDom);

    // Inject the replacement variables. Values are JavaScript-string-encoded so a quote,
    // backslash or "</script>" inside the URL cannot break out of the literal.
    var addNode = document.CreateElement("script");
    addNode.TextContent =
        "var newhref=\"" + System.Web.HttpUtility.JavaScriptStringEncode(url) + "\";" +
        "var newhost=\"" + System.Web.HttpUtility.JavaScriptStringEncode(myurl.Host) + "\";" +
        "var newprotocol=\"" + System.Web.HttpUtility.JavaScriptStringEncode(myurl.Scheme) + ":\";";
    // Prepend (not append): head scripts that get rewritten to use these variables must
    // find them already defined when they execute.
    document.Head.Prepend(addNode);

    // Make protocol-relative <link href="//..."> absolute. The query is materialized first
    // because document.All must not be enumerated lazily while the tree is being edited, and
    // the attribute is set directly instead of string-replacing OuterHtml (which misses
    // hrefs whose serialized form is entity-escaped, e.g. containing '&').
    var linkItems = document.All.Where(p => p.LocalName == "link").ToList();
    foreach (var item in linkItems)
    {
        var href = item.GetAttribute("href");
        if (!string.IsNullOrWhiteSpace(href)
            && href.Length > 2
            && href.StartsWith("//", StringComparison.Ordinal))
        {
            item.SetAttribute("href", "https:" + href);
        }
    }

    // Replace video iframes with a <video> element that plays the resolved address.
    var videoItems = document.All
        .Where(p => p.LocalName == "iframe" && p.ClassName == "video_iframe rich_pages")
        .ToList();
    foreach (var item in videoItems)
    {
        var vid = item.GetAttribute("data-mpvid");
        if (string.IsNullOrWhiteSpace(vid))
        {
            continue;
        }
        var realUrl = GetRealVideo(vid);
        if (!string.IsNullOrWhiteSpace(realUrl) && item.Parent != null)
        {
            var addvideoNode = document.CreateElement("video");
            addvideoNode.SetAttribute("src", realUrl);
            addvideoNode.SetAttribute("controls", "controls");
            // Swap in place so the video stays where the iframe was, rather than being
            // moved to the end of the parent.
            item.Parent.ReplaceChild(addvideoNode, item);
        }
    }

    var newOuterHtml = document.DocumentElement.OuterHtml;

    // Global textual replacements on the serialized markup.
    // NOTE(review): the original image URL is placed into the imgUrl query parameter without
    // URL-encoding; an image address containing '&' would be cut off there — confirm.
    return newOuterHtml
        .Replace("\"//res", "\"https://res")
        .Replace("location.protocol", "newprotocol")
        .Replace("location.href", "newhref")
        .Replace("location.host", "newhost")
        .Replace("src=\"https://mmbiz.qpic.cn/", "src=\"http://自己的webapi地址/GetImgStream?imgUrl=http://mmbiz.qpic.cn/")
        .Replace("src=\"/mp/videoplayer?", "src=\"https://mp.weixin.qq.com/mp/videoplayer?");
}
好了,圖片出來了,視頻也出來了,本以爲大功告成,突然發現視頻還有一種格式的
打開F12,我們可以看到這個視頻是iframe套了一層,而且路徑不能直接打開
這就是視頻的加密機制,於是我們需要把視頻給解密出來 ,還是看請求來分析
我們注意到這個請求,url就是真實的視頻地址,而入參的id就是上面那個iframe的id,於是,我們需要把視頻先解密出來,然後把Iframe替換成video標籤
/// <summary>
/// Rewrites a downloaded page so that it renders when served from another host.
/// </summary>
/// <remarks>
/// Steps, in order: add a <c>referrer</c> meta tag; inject a script defining
/// <c>newhref</c>/<c>newhost</c>/<c>newprotocol</c> from <paramref name="url"/>; make
/// protocol-relative <c>link</c> hrefs absolute (https); replace video iframes that carry a
/// <c>data-mpvid</c> attribute with a <c>video</c> element pointing at the address returned by
/// <c>GetRealVideo</c>; finally apply textual replacements on the serialized markup
/// (<c>"//res</c>, <c>location.*</c>, image sources routed through the image API, and the
/// relative video player path).
/// </remarks>
/// <param name="html">Source markup of the downloaded page.</param>
/// <param name="url">Address the page was requested from; must be an absolute URI.</param>
/// <returns>The rewritten document markup.</returns>
public static string HandleHtml(string html, string url)
{
    var parser = new HtmlParser();
    var document = parser.ParseDocument(html);
    Uri myurl = new Uri(url);

    // Referrer meta tag added to the head.
    var addMetaDom = document.CreateElement("meta");
    addMetaDom.SetAttribute("name", "referrer");
    addMetaDom.SetAttribute("content", "never");
    document.Head.Append(addMetaDom);

    // Inject the replacement variables. Values are JavaScript-string-encoded so a quote,
    // backslash or "</script>" inside the URL cannot break out of the literal.
    var addNode = document.CreateElement("script");
    addNode.TextContent =
        "var newhref=\"" + System.Web.HttpUtility.JavaScriptStringEncode(url) + "\";" +
        "var newhost=\"" + System.Web.HttpUtility.JavaScriptStringEncode(myurl.Host) + "\";" +
        "var newprotocol=\"" + System.Web.HttpUtility.JavaScriptStringEncode(myurl.Scheme) + ":\";";
    // Prepend (not append): head scripts that get rewritten to use these variables must
    // find them already defined when they execute.
    document.Head.Prepend(addNode);

    // Make protocol-relative <link href="//..."> absolute. The query is materialized first
    // because document.All must not be enumerated lazily while the tree is being edited, and
    // the attribute is set directly instead of string-replacing OuterHtml (which misses
    // hrefs whose serialized form is entity-escaped, e.g. containing '&').
    var linkItems = document.All.Where(p => p.LocalName == "link").ToList();
    foreach (var item in linkItems)
    {
        var href = item.GetAttribute("href");
        if (!string.IsNullOrWhiteSpace(href)
            && href.Length > 2
            && href.StartsWith("//", StringComparison.Ordinal))
        {
            item.SetAttribute("href", "https:" + href);
        }
    }

    // Replace video iframes with a <video> element that plays the resolved address.
    var videoItems = document.All
        .Where(p => p.LocalName == "iframe" && p.ClassName == "video_iframe rich_pages")
        .ToList();
    foreach (var item in videoItems)
    {
        var vid = item.GetAttribute("data-mpvid");
        if (string.IsNullOrWhiteSpace(vid))
        {
            continue;
        }
        var realUrl = GetRealVideo(vid);
        if (!string.IsNullOrWhiteSpace(realUrl) && item.Parent != null)
        {
            var addvideoNode = document.CreateElement("video");
            addvideoNode.SetAttribute("src", realUrl);
            addvideoNode.SetAttribute("controls", "controls");
            // Swap in place so the video stays where the iframe was, rather than being
            // moved to the end of the parent.
            item.Parent.ReplaceChild(addvideoNode, item);
        }
    }

    var newOuterHtml = document.DocumentElement.OuterHtml;

    // Global textual replacements on the serialized markup.
    // NOTE(review): the original image URL is placed into the imgUrl query parameter without
    // URL-encoding; an image address containing '&' would be cut off there — confirm.
    return newOuterHtml
        .Replace("\"//res", "\"https://res")
        .Replace("location.protocol", "newprotocol")
        .Replace("location.href", "newhref")
        .Replace("location.host", "newhost")
        .Replace("src=\"https://mmbiz.qpic.cn/", "src=\"http://自己的webapi地址/GetImgStream?imgUrl=http://mmbiz.qpic.cn/")
        .Replace("src=\"/mp/videoplayer?", "src=\"https://mp.weixin.qq.com/mp/videoplayer?");
}
/// <summary>
/// Resolves the playable address of an official-account video from its vid.
/// </summary>
/// <remarks>
/// Requests the <c>get_mp_video_play_url</c> action of the video player endpoint with
/// <c>f=json</c>, deserializes the response into <see cref="VideoSerializeModel"/> and returns
/// the address of the first entry.
/// NOTE(review): the <c>__biz</c> and <c>mid</c> query values are hard-coded; whether they
/// must match the article being processed is not visible here — confirm.
/// </remarks>
/// <param name="vid">Video id (the iframe's <c>data-mpvid</c> value).</param>
/// <returns>The address of the first video entry in the response.</returns>
/// <exception cref="InvalidOperationException">The response is empty or contains no video entries.</exception>
public static string GetRealVideo(string vid)
{
    // Escape the id so reserved characters cannot alter the query string.
    string escapedVid = Uri.EscapeDataString(vid ?? string.Empty);
    string txUrl = $"https://mp.weixin.qq.com/mp/videoplayer?action=get_mp_video_play_url&preview=0&__biz=MzI4NDk4OTQxMg==&mid=2247485200&idx=1&vid={escapedVid}&uin=&key=&pass_ticket=&wxtoken=777&appmsg_token=&x5=0&f=json";

    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(txUrl);
    request.Method = "GET";
    var backJson = GetBackHtml(request, "1");

    // A missing body is reported the same way as an unparsable one.
    if (string.IsNullOrWhiteSpace(backJson))
    {
        throw new InvalidOperationException("視頻解析錯誤");
    }

    var model = JsonConvert.DeserializeObject<VideoSerializeModel>(backJson);
    // videoinfos can be null when the payload carries an explicit null for "url_info".
    if (model == null || model.videoinfos == null || model.videoinfos.Count == 0)
    {
        throw new InvalidOperationException("視頻解析錯誤");
    }
    return model.videoinfos[0].url;
}
/// <summary>
/// Deserialization target for the video play-url JSON response.
/// </summary>
public class VideoSerializeModel
{
    /// <summary>
    /// Video title.
    /// </summary>
    public string title { get; set; }

    /// <summary>
    /// Entries of the <c>url_info</c> JSON member. Exposed as a property, like the other
    /// members, and initialized to an empty list so it is non-null on a fresh instance.
    /// </summary>
    [JsonProperty(PropertyName = "url_info")]
    public List<VideoModel> videoinfos { get; set; } = new List<VideoModel>();
}
/// <summary>
/// One entry of the <c>url_info</c> list in the video play-url response.
/// All values are kept as strings. Member names are lower-case — presumably to match
/// the JSON keys; verify against the payload.
/// </summary>
public class VideoModel
{
    /// <summary>Duration value; presumably milliseconds, judging by the name — confirm.</summary>
    public string duration_ms { get; set; }

    /// <summary>File size value as supplied by the response.</summary>
    public string filesize { get; set; }

    /// <summary>Identifier of the format of this entry.</summary>
    public string format_id { get; set; }

    /// <summary>Video height as supplied by the response.</summary>
    public string height { get; set; }

    /// <summary>Address of the video; this is the value returned by <c>GetRealVideo</c>.</summary>
    public string url { get; set; }

    /// <summary>Video width as supplied by the response.</summary>
    public string width { get; set; }
}
然後大功告成,我們就可以根據公衆號網頁的url 直接放到我們自己的網站上了