.net 把公衆號網頁引用到自己的網站中(圖片解析和視頻解析)

1. 隨便找個公衆號地址我們來抓一下代碼

   測試的地址:https://mp.weixin.qq.com/s?__biz=MjM5MDk4NTg2MA==&mid=2652650152&idx=1&sn=93e4aeb524ee94cb64d1e9dbd5cf0266&chksm=bd54e1438a236855e93c27f0565b91380277ab484f8deda25745986b295b58046d75e52a68e2&scene=21#wechat_redirect

(隨便找的)

2. 我們先用一般的方式抓取網頁源代碼

     /// <summary>
        /// Downloads the page at <paramref name="url"/> and decodes it into a string.
        /// </summary>
        /// <remarks>
        /// The bytes are first decoded as GB2312 so that the charset declaration can be read with
        /// <c>GetCharset</c>; when that declaration says UTF-8 or Unicode the bytes are decoded
        /// again with the declared encoding.
        /// </remarks>
        /// <param name="url">Address of the page to download.</param>
        /// <returns>The decoded HTML, or <c>null</c> when the download or decoding fails.</returns>
        public static string GetHtmlByUrl(string url)
        {
            using (WebClient wc = new WebClient())
            {
                try
                {
                    wc.UseDefaultCredentials = true;
                    wc.Proxy = new WebProxy();
                    wc.Proxy.Credentials = CredentialCache.DefaultCredentials;
                    wc.Credentials = System.Net.CredentialCache.DefaultCredentials;
                    byte[] bt = wc.DownloadData(url);
                    string txt = System.Text.Encoding.GetEncoding("GB2312").GetString(bt);

                    // Compare with the invariant culture: a culture-sensitive ToUpper() maps 'i' to a
                    // dotted capital I under Turkish-style cultures, so "unicode" would never match.
                    string declared = (GetCharset(txt) ?? string.Empty).Trim().ToUpperInvariant();
                    switch (declared)
                    {
                        case "UTF-8":
                            txt = System.Text.Encoding.UTF8.GetString(bt);
                            break;
                        case "UNICODE":
                            txt = System.Text.Encoding.Unicode.GetString(bt);
                            break;
                        default:
                            // Keep the GB2312 decoding for every other (or missing) declaration.
                            break;
                    }
                    return txt;
                }
                catch (Exception)
                {
                    // Deliberate best effort: a null result signals failure to the caller.
                    return null;
                }
            }
        }


  /// <summary>
        /// Extracts the charset declared in the meta tags of an HTML document.
        /// </summary>
        /// <remarks>
        /// The <c>content="...; charset=xxx"</c> form is tried first, then the
        /// <c>&lt;meta charset=xxx&gt;</c> form.
        /// </remarks>
        /// <param name="html">HTML source to inspect; null or empty yields an empty result.</param>
        /// <returns>The declared charset name, or an empty string when none is found.</returns>
        public static string GetCharset(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return "";
            }

            string charset = ExtractCharsetGroup(html, @"content=[""'].*\s*charset\b\s*=\s*""?(?<charset>[^""']*)");
            if (charset.Length == 0)
            {
                charset = ExtractCharsetGroup(html, @"<\s*meta\s*charset\s*=\s*[""']?(?<charset>[^""']*)");
            }
            return charset;
        }

        /// <summary>
        /// Runs <paramref name="pattern"/> once (case-insensitively) and returns its cleaned
        /// "charset" group, or an empty string when the pattern does not match.
        /// </summary>
        private static string ExtractCharsetGroup(string html, string pattern)
        {
            Match match = Regex.Match(html, pattern, RegexOptions.IgnoreCase);
            if (!match.Success)
            {
                return "";
            }

            // The capture runs until the next quote, so an unquoted value such as
            // <meta charset=utf-8> would drag the rest of the markup along; cut it at the
            // first character that cannot be part of a charset name.
            string value = match.Groups["charset"].Value.Trim();
            int end = value.IndexOfAny(new[] { ' ', '\t', '\r', '\n', ';', '>', '/' });
            return end >= 0 ? value.Substring(0, end) : value;
        }

3. 我們將爬出來的網頁保存一哈瞅瞅看

   

  /// <summary>
        /// Writes the given HTML, followed by a line terminator, to the file "MyHtml.html" as UTF-8,
        /// replacing any existing content.
        /// </summary>
        /// <param name="html">Markup to write.</param>
        /// <returns><c>true</c> when the file was written; <c>false</c> when any exception occurred.</returns>
        public static bool SaveHtml(string html)
        {
            bool saved;
            try
            {
                StreamWriter writer = new StreamWriter("MyHtml.html", append: false, encoding: Encoding.UTF8);
                try
                {
                    // Write the whole string to the file in one call.
                    writer.WriteLine(html);
                }
                finally
                {
                    writer.Dispose();
                }
                saved = true;
            }
            catch (Exception)
            {
                saved = false;
            }
            return saved;
        }

 這樣調用

  // Example call: download the sample article and save the raw, unmodified HTML.
  var url = "https://mp.weixin.qq.com/s?__biz=MjM5MDk4NTg2MA==&mid=2652650152&idx=1&sn=93e4aeb524ee94cb64d1e9dbd5cf0266&chksm=bd54e1438a236855e93c27f0565b91380277ab484f8deda25745986b295b58046d75e52a68e2&scene=21#wechat_redirect";
            // GetHtmlByUrl returns null when the download fails.
            var backHtml = GetHtmlByUrl(url);
            // SaveHtml returns true when MyHtml.html was written.
            var saveResult = SaveHtml(backHtml);

這是原網頁

這是我們爬到本地的網頁

此時我們F12打開調試可以看到,很多錯誤信息

我們可以看到,js用的是本地的地址,我們這裏需要把原來Html的代碼給改了(location.protocol,location.href,location.host) 採用絕對路徑 

4.我們採用第三方的 AngleSharp 來改Html代碼 (以前都寫正則,現在忘了怎麼寫了。。),直接Nuget安裝就行了

 

     /// <summary>
        /// Rewrites a downloaded article so that its scripts and resources still resolve when the
        /// page is served from a different origin.
        /// </summary>
        /// <remarks>
        /// A script defining <c>newhref</c>, <c>newhost</c>, <c>newhostname</c> and
        /// <c>newprotocol</c> is injected, and textual references to <c>location.href</c>,
        /// <c>location.host</c> and <c>location.protocol</c> are redirected to those globals.
        /// </remarks>
        /// <param name="html">HTML source of the downloaded page.</param>
        /// <param name="url">Absolute address the page was requested from.</param>
        /// <returns>The rewritten HTML document.</returns>
        public static string HandleHtml(string html, string url)
        {
            var parser = new HtmlParser();
            var document = parser.ParseDocument(html);
            Uri myurl = new Uri(url);

            // Inject the replacement globals.
            //  - AbsoluteUri is percent-encoded, so a quote in the input cannot end the JS string literal.
            //  - newhostname exists because the textual "location.host" replacement below also
            //    turns "location.hostname" into "newhostname".
            //  - Authority keeps a non-default port, matching what location.host reports.
            var addNode = document.CreateElement("script");
            addNode.TextContent = $"var newhref=\"{myurl.AbsoluteUri}\";var newhost=\"{myurl.Authority}\";var newhostname=\"{myurl.Host}\";var newprotocol=\"{myurl.Scheme}:\";";
            // Prepend so the globals are defined before any existing head script that uses them runs.
            document.Head.Prepend(addNode);

            var newOuterHtml = document.DocumentElement.OuterHtml;

            // Global textual rewrites: pin protocol-relative "//res..." URLs to https and
            // redirect location.* reads to the injected globals.
            return newOuterHtml
                .Replace("\"//res", "\"https://res")
                .Replace("location.protocol", "newprotocol")
                .Replace("location.href", "newhref")
                .Replace("location.host", "newhost");
        }

  調用方式我們也改一下

  

          // Only post-process and save when the download produced content;
          // GetHtmlByUrl returns null when it fails.
          if (!string.IsNullOrWhiteSpace(backHtml))
            {
                // Rewrite the page first, then persist the rewritten markup.
                var newHtml = HandleHtml(backHtml, url);
                var saveResult = SaveHtml(newHtml);
            }

然後我們再看看效果如何

  圖片出來了,視頻沒有出來。。。

 繼續打開調試,發現是有些js沒有加載,

這些js還是用的本地路徑,因爲html引用的js內部仍然依賴網絡地址,除非把js也下載下來。於是,我們把爬下來的網頁掛在IIS下試試

一個是圖片未授權,一個是圖片跨域 。。

 未授權好辦,直接html加一段代碼 ,跨域我想到的辦法是自己服務器替換一次(就是說圖片地址改爲我們服務器的地址)直接上處理過的代碼

        /// <summary>
        /// Image proxy: downloads the image at <paramref name="imgUrl"/> on the server and returns
        /// its bytes to the browser.
        /// </summary>
        /// <param name="imgUrl">Absolute http/https address of an image on a <c>*.qpic.cn</c> host.</param>
        /// <returns>
        /// 200 with the image bytes, or 400 when <paramref name="imgUrl"/> is missing, not an
        /// absolute http/https URL, or points at another host.
        /// </returns>
        [HttpGet]
        [Route("GetImgStream")]
        public HttpResponseMessage GetImgStream([FromUri]string imgUrl)
        {
            // Without this check the endpoint would fetch any address a caller supplies
            // (server-side request forgery / open proxy). The rewritten pages only ever point
            // it at mmbiz.qpic.cn.
            Uri target;
            bool allowed = Uri.TryCreate(imgUrl, UriKind.Absolute, out target)
                && (target.Scheme == Uri.UriSchemeHttp || target.Scheme == Uri.UriSchemeHttps)
                && target.Host.EndsWith(".qpic.cn", StringComparison.OrdinalIgnoreCase);
            if (!allowed)
            {
                return new HttpResponseMessage(HttpStatusCode.BadRequest);
            }

            var imageBuffer = GetDownloadStream(imgUrl);
            var respimg = new HttpResponseMessage(HttpStatusCode.OK)
            {
                Content = new System.Net.Http.ByteArrayContent(imageBuffer)
            };
            // "image/jpg" is not a registered media type; the registered name is "image/jpeg".
            respimg.Content.Headers.ContentType = new MediaTypeHeaderValue("image/jpeg");
            return respimg;
        }


        /// <summary>
        /// Downloads the resource at <paramref name="url"/> and returns its body as a byte array.
        /// </summary>
        /// <param name="url">Full http/https address of the remote file.</param>
        /// <returns>The complete response body.</returns>
        /// <exception cref="NotSupportedException">
        /// <paramref name="url"/> does not resolve to an HTTP request or response.
        /// </exception>
        public  byte[] GetDownloadStream(string url)
        {
            // No try/catch here: the original only re-threw with "throw ex;", which discards the
            // stack trace. Letting exceptions propagate keeps the same types and the full trace.
            HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
            if (request == null)
            {
                throw new NotSupportedException("Only http/https URLs can be downloaded: " + url);
            }
            request.Method = "GET";

            // Send the request; the using blocks release the connection even when reading fails.
            using (WebResponse rawResponse = request.GetResponse())
            {
                HttpWebResponse response = rawResponse as HttpWebResponse;
                if (response == null)
                {
                    throw new NotSupportedException("Unexpected response type for: " + url);
                }

                using (Stream responseStream = response.GetResponseStream())
                using (MemoryStream ms = new MemoryStream())
                {
                    if (responseStream != null)
                    {
                        // Buffered copy instead of one ReadByte/WriteByte call per byte.
                        responseStream.CopyTo(ms);
                    }
                    return ms.ToArray();
                }
            }
        }

 我們寫一個webapi,然後把html所有的圖片地址都換成我們的api地址動態獲取圖片,於是我們的代碼改成了這樣

      /// <summary>
        /// Rewrites a downloaded article so that its scripts, stylesheets, images and videos still
        /// work when the page is served from a different origin.
        /// </summary>
        /// <remarks>
        /// Exceptions thrown by <c>GetRealVideo</c> while resolving a video address propagate to
        /// the caller.
        /// </remarks>
        /// <param name="html">HTML source of the downloaded page.</param>
        /// <param name="url">Absolute address the page was requested from.</param>
        /// <returns>The rewritten HTML document.</returns>
        public static string HandleHtml(string html, string url)
        {
            var parser = new HtmlParser();
            var document = parser.ParseDocument(html);
            Uri myurl = new Uri(url);

            // Referrer policy meta tag: stops the browser from sending a Referer header.
            var addMetaDom = document.CreateElement("meta");
            addMetaDom.SetAttribute("name", "referrer");
            addMetaDom.SetAttribute("content", "never");
            document.Head.Append(addMetaDom);

            // Inject the replacement globals.
            //  - AbsoluteUri is percent-encoded, so a quote in the input cannot end the JS string literal.
            //  - newhostname exists because the textual "location.host" replacement below also
            //    turns "location.hostname" into "newhostname".
            //  - Authority keeps a non-default port, matching what location.host reports.
            var addNode = document.CreateElement("script");
            addNode.TextContent = $"var newhref=\"{myurl.AbsoluteUri}\";var newhost=\"{myurl.Authority}\";var newhostname=\"{myurl.Host}\";var newprotocol=\"{myurl.Scheme}:\";";
            // Prepend so the globals are defined before any existing head script that uses them runs.
            document.Head.Prepend(addNode);

            // <link> tags: pin protocol-relative hrefs ("//host/path") to https.
            // ToList() takes a snapshot so the tree is not modified while the query is still
            // being enumerated.
            var linkElements = document.All.Where(p => p.LocalName == "link").ToList();
            foreach (var link in linkElements)
            {
                var href = link.GetAttribute("href");
                if (!string.IsNullOrWhiteSpace(href)
                    && href.Length > 2
                    && href.StartsWith("//", StringComparison.Ordinal))
                {
                    // Set the attribute directly: replacing text inside OuterHtml misses hrefs
                    // whose serialized form differs from the attribute value (e.g. "&" vs "&amp;").
                    link.SetAttribute("href", "https:" + href);
                }
            }

            // Video iframes: swap each one for a <video> element pointing at the real file.
            var videoFrames = document.All
                .Where(p => p.LocalName == "iframe" && p.ClassName == "video_iframe rich_pages")
                .ToList();
            foreach (var frame in videoFrames)
            {
                var vid = frame.GetAttribute("data-mpvid");
                if (string.IsNullOrWhiteSpace(vid))
                {
                    continue;
                }

                var realUrl = GetRealVideo(vid);
                var parent = frame.Parent;
                if (string.IsNullOrWhiteSpace(realUrl) || parent == null)
                {
                    continue;
                }

                var videoNode = document.CreateElement("video");
                videoNode.SetAttribute("src", realUrl);
                videoNode.SetAttribute("controls", "controls");
                // Replace in place so the video stays where the iframe was instead of moving
                // to the end of the parent element.
                parent.ReplaceChild(videoNode, frame);
            }

            var newOuterHtml = document.DocumentElement.OuterHtml;

            // Global textual rewrites: pin "//res..." URLs to https, redirect location.* reads to
            // the injected globals, route images through the proxy API and make the video player
            // path absolute.
            // "自己的webapi地址" is a placeholder for the host of your own image proxy.
            // NOTE(review): the image URL is appended to the proxy query string without
            // URL-encoding, so an "&" in the original address would presumably be read as a
            // separate parameter of the proxy request - confirm and encode if needed.
            return newOuterHtml
                .Replace("\"//res", "\"https://res")
                .Replace("location.protocol", "newprotocol")
                .Replace("location.href", "newhref")
                .Replace("location.host", "newhost")
                .Replace("src=\"https://mmbiz.qpic.cn/", "src=\"http://自己的webapi地址/GetImgStream?imgUrl=http://mmbiz.qpic.cn/")
                .Replace("src=\"/mp/videoplayer?", "src=\"https://mp.weixin.qq.com/mp/videoplayer?");
        }

好了,圖片出來了,視頻也出來了,本以爲大功告成,突然發現視頻還有一種格式的

打開F12,我們可以看到這個視頻是iframe套了一層,而且路徑不能直接打開

這就是視頻的加密機制,於是我們需要把視頻給解密出來 ,還是看請求來分析

我們注意到這個請求,url就是真實的視頻地址,而入參的id就是上面那個iframe的id,於是,我們需要把視頻先解密出來,然後把Iframe替換成video標籤

        /// <summary>
        /// Rewrites a downloaded article so that its scripts, stylesheets, images and videos still
        /// work when the page is served from a different origin.
        /// </summary>
        /// <remarks>
        /// Exceptions thrown by <c>GetRealVideo</c> while resolving a video address propagate to
        /// the caller.
        /// </remarks>
        /// <param name="html">HTML source of the downloaded page.</param>
        /// <param name="url">Absolute address the page was requested from.</param>
        /// <returns>The rewritten HTML document.</returns>
        public static string HandleHtml(string html, string url)
        {
            var parser = new HtmlParser();
            var document = parser.ParseDocument(html);
            Uri myurl = new Uri(url);

            // Referrer policy meta tag: stops the browser from sending a Referer header.
            var addMetaDom = document.CreateElement("meta");
            addMetaDom.SetAttribute("name", "referrer");
            addMetaDom.SetAttribute("content", "never");
            document.Head.Append(addMetaDom);

            // Inject the replacement globals.
            //  - AbsoluteUri is percent-encoded, so a quote in the input cannot end the JS string literal.
            //  - newhostname exists because the textual "location.host" replacement below also
            //    turns "location.hostname" into "newhostname".
            //  - Authority keeps a non-default port, matching what location.host reports.
            var addNode = document.CreateElement("script");
            addNode.TextContent = $"var newhref=\"{myurl.AbsoluteUri}\";var newhost=\"{myurl.Authority}\";var newhostname=\"{myurl.Host}\";var newprotocol=\"{myurl.Scheme}:\";";
            // Prepend so the globals are defined before any existing head script that uses them runs.
            document.Head.Prepend(addNode);

            // <link> tags: pin protocol-relative hrefs ("//host/path") to https.
            // ToList() takes a snapshot so the tree is not modified while the query is still
            // being enumerated.
            var linkElements = document.All.Where(p => p.LocalName == "link").ToList();
            foreach (var link in linkElements)
            {
                var href = link.GetAttribute("href");
                if (!string.IsNullOrWhiteSpace(href)
                    && href.Length > 2
                    && href.StartsWith("//", StringComparison.Ordinal))
                {
                    // Set the attribute directly: replacing text inside OuterHtml misses hrefs
                    // whose serialized form differs from the attribute value (e.g. "&" vs "&amp;").
                    link.SetAttribute("href", "https:" + href);
                }
            }

            // Video iframes: swap each one for a <video> element pointing at the real file.
            var videoFrames = document.All
                .Where(p => p.LocalName == "iframe" && p.ClassName == "video_iframe rich_pages")
                .ToList();
            foreach (var frame in videoFrames)
            {
                var vid = frame.GetAttribute("data-mpvid");
                if (string.IsNullOrWhiteSpace(vid))
                {
                    continue;
                }

                var realUrl = GetRealVideo(vid);
                var parent = frame.Parent;
                if (string.IsNullOrWhiteSpace(realUrl) || parent == null)
                {
                    continue;
                }

                var videoNode = document.CreateElement("video");
                videoNode.SetAttribute("src", realUrl);
                videoNode.SetAttribute("controls", "controls");
                // Replace in place so the video stays where the iframe was instead of moving
                // to the end of the parent element.
                parent.ReplaceChild(videoNode, frame);
            }

            var newOuterHtml = document.DocumentElement.OuterHtml;

            // Global textual rewrites: pin "//res..." URLs to https, redirect location.* reads to
            // the injected globals, route images through the proxy API and make the video player
            // path absolute.
            // "自己的webapi地址" is a placeholder for the host of your own image proxy.
            // NOTE(review): the image URL is appended to the proxy query string without
            // URL-encoding, so an "&" in the original address would presumably be read as a
            // separate parameter of the proxy request - confirm and encode if needed.
            return newOuterHtml
                .Replace("\"//res", "\"https://res")
                .Replace("location.protocol", "newprotocol")
                .Replace("location.href", "newhref")
                .Replace("location.host", "newhost")
                .Replace("src=\"https://mmbiz.qpic.cn/", "src=\"http://自己的webapi地址/GetImgStream?imgUrl=http://mmbiz.qpic.cn/")
                .Replace("src=\"/mp/videoplayer?", "src=\"https://mp.weixin.qq.com/mp/videoplayer?");
        }








        /// <summary>
        /// Resolves the playable address of an official-account video from its vid.
        /// </summary>
        /// <param name="vid">Video id, as read from the iframe's <c>data-mpvid</c> attribute.</param>
        /// <returns>The <c>url</c> of the first entry in the response's <c>url_info</c> list.</returns>
        /// <exception cref="InvalidOperationException">
        /// The response is empty or contains no video entries.
        /// </exception>
        public static string GetRealVideo(string vid)
        {
            // Escape the id so characters such as '&' or '#' cannot alter the query string.
            // NOTE(review): __biz and mid are hard-coded to one article; presumably the endpoint
            // only keys on vid - confirm.
            string txUrl = $"https://mp.weixin.qq.com/mp/videoplayer?action=get_mp_video_play_url&preview=0&__biz=MzI4NDk4OTQxMg==&mid=2247485200&idx=1&vid={Uri.EscapeDataString(vid ?? string.Empty)}&uin=&key=&pass_ticket=&wxtoken=777&appmsg_token=&x5=0&f=json";
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(txUrl);
            request.Method = "GET";

            // GetBackHtml is defined outside this snippet; presumably it sends the request and
            // returns the response body - confirm.
            var backJson = GetBackHtml(request, "1");

            // A blank body would make DeserializeObject throw or return null; both cases are
            // reported through the same error as an empty result.
            var parsed = string.IsNullOrWhiteSpace(backJson)
                ? null
                : JsonConvert.DeserializeObject<VideoSerializeModel>(backJson);
            if (parsed == null || parsed.videoinfos == null || parsed.videoinfos.Count == 0)
            {
                throw new InvalidOperationException("視頻解析錯誤");
            }
            return parsed.videoinfos[0].url;
        }
  /// <summary>
    /// Deserialization target for the JSON returned by the video play-url request.
    /// </summary>
    public class VideoSerializeModel
    {
        /// <summary>
        /// Video title.
        /// </summary>
        public string title { get; set; }

        /// <summary>
        /// Entries of the JSON <c>url_info</c> array; starts as an empty list.
        /// </summary>
        /// <remarks>
        /// Exposed as a property rather than a public field, consistent with the other members.
        /// </remarks>
        [JsonProperty(PropertyName = "url_info")]
        public List<VideoModel> videoinfos { get; set; } = new List<VideoModel>();
    }


    /// <summary>
    /// One entry of the <c>url_info</c> list held by <c>VideoSerializeModel.videoinfos</c>.
    /// Every member is a plain string.
    /// NOTE(review): the member names presumably mirror the JSON keys of the response - confirm.
    /// </summary>
    public class VideoModel
    {
        // Presumably the duration in milliseconds, going by the name - confirm.
        public string duration_ms { get; set; }
        // Presumably the file size of this variant - unit not shown here, confirm.
        public string filesize { get; set; }
        // Presumably identifies the format/quality variant - confirm.
        public string format_id { get; set; }
        // Presumably the frame height - confirm.
        public string height { get; set; }
        // Address returned by GetRealVideo for the first entry of the list.
        public string url { get; set; }
        // Presumably the frame width - confirm.
        public string width { get; set; }
    }

然後大功告成,我們就可以根據公衆號網頁的url  直接放到我們自己的網站上了

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章