衆所周知,搜索引擎的製作是非常繁瑣和耗時的,對於企業級的搜索引擎的製作,需要有良好的蜘蛛程序,定期更新搜索資源庫,並且完善優化搜索引擎的速度和方法(比如全文搜索等),減少垃圾網頁的出現,是一個很值得深入研究的話題。
這裏我們當然不是教大家去做類似Google這樣強大的搜索引擎(個人力量有限),也不是簡單地調用Google的API來實現,這裏主要提供給大家怎麼對網頁信息進行篩選和查詢的功能,我們可以製作一個這樣簡單的搜索網頁的功能,放在我們的個人主頁上,作爲站內搜索的工具。
[本示例完整源碼下載(0分)] http://download.csdn.net/source/3513103
好了,言歸正傳,我們簡單看一下這個功能的實現過程:
首先我們建立一系列的站內網頁文件,這裏命名爲WebPage0~9,包含一些簡單的信息
給出一個示例HTML:
<!-- Sample site page (one of the WebPage0~9 files): a title plus one line of
     body text inside a server-side form. -->
<html xmlns="http://www.w3.org/1999/xhtml">
<head runat="server">
<title>Onecode</title>
</head>
<body>
<form id="form1" runat="server">
<div>
Hi, Onecode team.
</div>
</form>
</body>
</html>
接着建立一個SearchEngine的web頁面,此頁面提供程序的主界面,擁有一個TextBox,Button和GridView控件,接收用戶輸入的關鍵字,並且返回對應的網頁信息:
HTML代碼如下:
<html xmlns="http://www.w3.org/1999/xhtml">
<head runat="server">
    <title></title>
</head>
<body>
    <form id="form1" runat="server">
    <div>
        Key word:
        <asp:TextBox ID="tbKeyWord" runat="server"></asp:TextBox>
        <br />
        <!-- The handler attribute must be spelled with a Latin "O" (OnClick);
             a look-alike character here leaves the button without a click handler. -->
        <asp:Button ID="btnSearchPage" runat="server" Text="Search your web page"
            OnClick="btnSearchPage_Click" />
        <!-- Result grid: one row per matching page, showing its title and a link to it. -->
        <asp:GridView ID="gvwResource" runat="server" AutoGenerateColumns="False">
            <Columns>
                <asp:BoundField DataField="Title" HeaderText="Page Name" />
                <asp:HyperLinkField DataNavigateUrlFields="Link" DataTextField="Link"
                    HeaderText="Page URL" />
            </Columns>
            <EmptyDataTemplate>
                No result
            </EmptyDataTemplate>
        </asp:GridView>
    </div>
    </form>
</body>
</html>
我們需要建立一個WebPage的實體類存儲有關的網頁信息,並且便於Linq的查詢和數據綁定,創建一個類文件,命名爲WebPageEntity.cs
C#代碼,這裏只存儲了最簡單信息(網頁名稱,內容(HTML),鏈接,標題,內容(文本)):
/// <summary>
/// Serializable container for the basic data of one web page:
/// its name, raw content, link, title and body text.
/// </summary>
[Serializable]
public class WebPageEntity
{
    // Backing fields. Their names and order are left untouched because the
    // type is marked [Serializable].
    private string name;
    private string content;
    private string link;
    private string title;
    private string body;

    /// <summary>Gets or sets the page name.</summary>
    public string Name
    {
        get { return this.name; }
        set { this.name = value; }
    }

    /// <summary>Gets or sets the page content.</summary>
    public string Content
    {
        get { return this.content; }
        set { this.content = value; }
    }

    /// <summary>Gets or sets the link (URL) of the page.</summary>
    public string Link
    {
        get { return this.link; }
        set { this.link = value; }
    }

    /// <summary>Gets or sets the page title.</summary>
    public string Title
    {
        get { return this.title; }
        set { this.title = value; }
    }

    /// <summary>Gets or sets the body text of the page.</summary>
    public string Body
    {
        get { return this.body; }
        set { this.body = value; }
    }
}
創建一個RegexMethod類,包含提取網頁標題,內容的方法,你可以選擇擴展這個類,建立自己獨有的搜索和排序方法:
代碼(RegexMethod.cs)
/// <summary>
/// Regex-based helpers that extract the title and the tag-free body text
/// from the HTML source of a page.
/// </summary>
public class RegexMethod
{
    // First <title> element; capture group 1 is the title text.
    // Attributes on the opening tag are tolerated.
    private const string TitlePattern = @"<title\b[^>]*>([^<]*)</title>";

    // <body> ... </body>, matched lazily. [\s\S] matches any character
    // (including line breaks) without the per-character alternation and
    // capture that a (\w|\W)*? group performs.
    private const string BodyPattern = @"<body[^>]*>[\s\S]*?</body[^>]*>";

    // Any single markup tag; used to strip tags from the matched body.
    private const string TagPattern = @"<[^>]*>";

    /// <summary>
    /// Retrieves the title text of a page.
    /// </summary>
    /// <param name="htmlCode">HTML source of the page; may be null or empty.</param>
    /// <returns>
    /// The text between the first &lt;title&gt; and &lt;/title&gt; tags, or an
    /// empty string when the input is null/empty or contains no title.
    /// </returns>
    public string GetTitleString(string htmlCode)
    {
        if (string.IsNullOrEmpty(htmlCode))
        {
            return string.Empty;
        }

        Match match = Regex.Match(htmlCode, TitlePattern, RegexOptions.IgnoreCase);

        // The captured group cannot contain '<', so it is already tag-free.
        return match.Success ? match.Groups[1].Value : string.Empty;
    }

    /// <summary>
    /// Retrieves the body text of a page with all markup tags removed.
    /// </summary>
    /// <param name="htmlCode">HTML source of the page; may be null or empty.</param>
    /// <returns>
    /// The concatenated text of every &lt;body&gt; element found, with tags
    /// stripped (whitespace is preserved), or an empty string when the input
    /// is null/empty or has no body element.
    /// </returns>
    public string GetBodyString(string htmlCode)
    {
        if (string.IsNullOrEmpty(htmlCode))
        {
            return string.Empty;
        }

        StringBuilder pureText = new StringBuilder();
        foreach (Match match in Regex.Matches(htmlCode, BodyPattern, RegexOptions.IgnoreCase))
        {
            pureText.Append(Regex.Replace(match.Value, TagPattern, string.Empty));
        }

        return pureText.ToString();
    }
}
準備工作已經OK,我們開始在SearchEngine.aspx.cs頁面寫主要的實現方法了,思想是這樣的,首先建立一個List<T>實例,獲取網頁的資源信息,爲了保持回傳的狀態,將這個List保留在ViewState中,獲取網頁資源將用到HttpWebRequest和HttpWebResponse類,並且用lock關鍵字定義互斥段代碼。實體類中Name和Link用於展示網頁名稱和鏈接,允許用戶通過點擊訪問網頁,Title和Body作爲搜索條件,Content用於通過RegexMethod class截取Title和Body。當收取網頁實體類完成後(注意這裏我們也可以收集外部網站的內容,同樣可以藉助我們的方法來執行搜索,這裏我加入Bing網站,www.bing.com),接下來是信息篩選階段:使用Linq查詢配合字符串的Contains方法,判斷標題和網頁內容是否包含對應的關鍵字,如果符合就加入到選中的list中,並顯示出來:
全部代碼如下(SearchEngine.aspx.cs)
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Net;
using System.IO;
using System.Text;
namespace CSASPNETDisplayDataStreamResource
{
    /// <summary>
    /// Code-behind of the search page. It downloads a fixed set of pages,
    /// extracts their title and body text, keeps the result in ViewState and
    /// filters it by the key words typed into <c>tbKeyWord</c>.
    /// </summary>
    public partial class SearchEngine : System.Web.UI.Page
    {
        // ViewState slot that holds the loaded List<WebPageEntity>.
        private const string ResourceViewStateKey = "Resource";

        // Number of sibling pages (WebPage0 .. WebPage9) that are indexed.
        private const int LocalPageCount = 10;

        // Additional external page that is indexed together with the local ones.
        private const string ExternalUrl = "http://www.bing.com/";

        // Timeout of a single page download, in milliseconds.
        private const int RequestTimeoutMilliseconds = 30000;

        // Private gate object: locking on "this" would expose the lock to outside code.
        private readonly object loadGate = new object();

        private List<WebPageEntity> webResources;

        // Outcome of the most recent LoadResource call; after LoadList it tells
        // whether at least one page could be loaded.
        private bool isLoad = true;

        /// <summary>
        /// Loads the page resources on the first (non-postback) request.
        /// </summary>
        /// <param name="sender">The page raising the event.</param>
        /// <param name="e">Event data (unused).</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            if (!IsPostBack)
            {
                this.LoadList();
            }
        }

        /// <summary>
        /// Gets the web resources stored in ViewState, loading them first
        /// when ViewState does not contain them yet.
        /// </summary>
        public List<WebPageEntity> WebResources
        {
            get
            {
                // Load only when nothing is stored; reloading an existing list
                // would repeat every download on each access.
                if (ViewState[ResourceViewStateKey] == null)
                {
                    this.LoadList();
                }

                return (List<WebPageEntity>)ViewState[ResourceViewStateKey];
            }
        }

        /// <summary>
        /// Downloads WebPage0..WebPage9 (URLs derived from the current request
        /// URL) plus the external page, and stores every page that loaded
        /// successfully in ViewState. A page that fails to load is skipped
        /// without affecting the pages after it.
        /// </summary>
        public void LoadList()
        {
            RegexMethod method = new RegexMethod();
            List<WebPageEntity> loaded = new List<WebPageEntity>();

            lock (loadGate)
            {
                string currentUrl = Page.Request.Url.ToString();
                for (int i = 0; i < LocalPageCount; i++)
                {
                    string url = currentUrl.Replace("SearchEngine", string.Format("WebPage{0}", i));
                    this.AddResource(loaded, method, url);
                }

                this.AddResource(loaded, method, ExternalUrl);

                // NOTE(review): each entity also carries the full page HTML in
                // Content, so it is serialized into ViewState as well - consider
                // dropping it if the page size becomes a problem.
                webResources = loaded;
                ViewState[ResourceViewStateKey] = loaded;

                // The load as a whole failed only if no page could be fetched.
                this.isLoad = loaded.Count > 0;
            }
        }

        /// <summary>
        /// Downloads one page and, when the download succeeded, appends its entity to the list.
        /// </summary>
        private void AddResource(List<WebPageEntity> target, RegexMethod method, string url)
        {
            string html = this.LoadResource(url);
            if (!this.isLoad)
            {
                // On failure the returned text is empty or an error message, not page content.
                return;
            }

            WebPageEntity webEntity = new WebPageEntity();
            webEntity.Name = Path.GetFileName(url);
            webEntity.Link = url;
            webEntity.Content = html;
            webEntity.Title = method.GetTitleString(html);
            webEntity.Body = method.GetBodyString(html);
            target.Add(webEntity);
        }

        /// <summary>
        /// Downloads the content of a page with HttpWebRequest / HttpWebResponse
        /// and reads it as UTF-8 text.
        /// </summary>
        /// <param name="url">Absolute URL of the page to download.</param>
        /// <returns>
        /// The page content on success; an empty string when the response is
        /// missing or its status is not 200 OK; the exception message when the
        /// request throws. In the two failure cases the internal load flag is
        /// cleared.
        /// </returns>
        public string LoadResource(string url)
        {
            // Start every download with a clean flag so that an earlier failure
            // does not mark this page as failed too.
            this.isLoad = true;
            try
            {
                HttpWebRequest webRequest = (HttpWebRequest)WebRequest.Create(url);
                webRequest.Timeout = RequestTimeoutMilliseconds;

                using (HttpWebResponse webResponse = (HttpWebResponse)webRequest.GetResponse())
                {
                    if (webResponse == null || webResponse.StatusCode != HttpStatusCode.OK)
                    {
                        this.isLoad = false;
                        return string.Empty;
                    }

                    using (Stream responseStream = webResponse.GetResponseStream())
                    using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
                    {
                        return reader.ReadToEnd();
                    }
                }
            }
            catch (Exception ex)
            {
                // Deliberately best-effort: any failure (bad URL, network error,
                // timeout) is reported through the flag and the message, never thrown.
                this.isLoad = false;
                return ex.Message;
            }
        }

        /// <summary>
        /// Search button click handler: shows every page whose body or title
        /// contains at least one of the space-separated key words
        /// (case-insensitive). With no key word, all pages are listed.
        /// </summary>
        /// <param name="sender">The button raising the event.</param>
        /// <param name="e">Event data (unused).</param>
        protected void btnSearchPage_Click(object sender, EventArgs e)
        {
            // Read the list once: this may trigger a (re)load, which also updates isLoad.
            List<WebPageEntity> resources = this.WebResources;
            if (!isLoad || resources == null)
            {
                Response.Write("Resource file load failed, please refresh your page.");
                return;
            }

            // Drop empty entries: an empty key is contained in every text and
            // would make repeated spaces match all pages.
            string[] keys = tbKeyWord.Text.Trim().Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

            if (keys.Length == 0)
            {
                gvwResource.DataSource = resources
                    .Select(entity => new { entity.Title, entity.Link })
                    .ToList();
                gvwResource.DataBind();
                return;
            }

            // Results keep the original order: by key word first, then by page.
            List<WebPageEntity> allSelectedResources = new List<WebPageEntity>();
            foreach (string key in keys)
            {
                foreach (WebPageEntity entity in resources)
                {
                    bool isMatch = ContainsIgnoreCase(entity.Body, key) || ContainsIgnoreCase(entity.Title, key);
                    if (isMatch && !allSelectedResources.Contains(entity))
                    {
                        allSelectedResources.Add(entity);
                    }
                }
            }

            gvwResource.DataSource = allSelectedResources;
            gvwResource.DataBind();
        }

        /// <summary>
        /// Case-insensitive substring test that treats a null or empty text as "no match".
        /// </summary>
        private static bool ContainsIgnoreCase(string text, string key)
        {
            return !string.IsNullOrEmpty(text)
                && text.IndexOf(key, StringComparison.OrdinalIgnoreCase) >= 0;
        }
    }
}
請按Ctrl+F5嘗試運行你的網站,輸入你的關鍵字開始搜索吧,比如onecode,bing,azure,hotmail等等。