最近天氣很熱,想想這麼高的溫度在全國應該排得上號吧,在全國氣溫排行的話能排第幾呢?打開手機天氣 app,沒找到排行功能……打開網頁搜索了一下相關網站,是我打開方式不對?點了幾個做天氣相關的網站,有幾個有高溫前十排行,大西安竟然沒上前十榜單。這把人熱成狗的溫度沒進前十,肯定也是第十一,但找了好幾個網站都沒找到全國溫度排行這類的功能。
好吧,自己弄一個。
最粗暴的想法,找一個氣象網站,給它發http get請求,把收到的數據包處理,抓出其中的各個城市對應的頁面資源名稱
such as -》 /weather/BeiJing.html /weather/ShangHai.html
然後把全國的這些城市都抓出來,再循環對每一個城市對應的頁面向服務器發送 http 請求
such as-》 Get /BeiJing.html HTTP/1.1 Host:www.tianqi.com
然後對每一個頁面返回的數據進行處理 篩選出 城市名稱 和最高溫度,存起來 然後排一下。OK
之前只聽說過網絡爬蟲這麼個概念,這次從網上找了個c++ 的簡單例子,然後自己弄了個抓天氣數據的,說是爬蟲有點牽強吧,就爬了兩層就不爬了。。。。
#include <limits.h>
#include <netdb.h>
#include <netinet/in.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <time.h>
#include <unistd.h>

#include <algorithm>
#include <iostream>
#include <queue>
#include <string>
#include <vector>
using namespace std;
// Page-buffer sizes, in bytes (both are 1 MiB).
#define MALLOC_SIZE (1024*1024)
// Starting capacity of the buffer GetHttpResponse() reads a response into;
// that function doubles the buffer when it runs low on space.
#define DEFAULT_PAGE_BUF_SIZE (1024*1024)
// One row of the temperature ranking: a city and the temperature text that was
// scraped for it. Both fields are stored as strings exactly as extracted;
// wea_cmp() converts `degree` to a number when sorting.
//
// NOTE(review): this file has `using namespace std;`, so from C++17 on the
// unqualified name `data` can clash with std::data. Write `::data` if the
// compiler reports the name as ambiguous.
class data
{
public:
    // Copies both C strings. A null pointer is stored as an empty string
    // rather than being handed to std::string, which would be undefined.
    // Taking const char* lets callers pass literals and const buffers as well
    // as the mutable char arrays used by the scraper.
    data(const char* city_name, const char* city_degree)
        : name(city_name != NULL ? city_name : ""),
          degree(city_degree != NULL ? city_degree : "")
    {}

    std::string name;    // city name
    std::string degree;  // temperature text as captured from the page
};
bool wea_cmp(data n1,data n2)
{
return (atoi( n1.degree.c_str()) > atoi(n2.degree.c_str()));
}
//發送get請求 獲得數據
bool GetHttpResponse(string &resource, char * &response, int &bytesRead )
{
// string resource = "/quanguo";
string host = "tianqi.114la.com";
//建立socket
struct hostent *hp;
hp= gethostbyname(host.c_str() );
if( hp==NULL )
{
cout<< "Can not find host address"<<endl;
return false;
}
int sock = socket(PF_INET,SOCK_STREAM,0);
if(sock < 0)
{
printf("creat socket error\n");
}
struct sockaddr_in address;
bzero(&address,sizeof(address));
address.sin_family = AF_INET;
memcpy(&(address.sin_addr.s_addr),hp->h_addr_list[0],hp->h_length);
address.sin_port = htons(80);
int ret = connect(sock,(struct sockaddr*)&address,sizeof(address));
if(ret < 0)
{
printf("client connect error\n");
close(sock);
return false;
}
//構造 http請求
string request = "GET " + resource + " HTTP/1.1\r\nHost:" + host + "\r\nConnection:Close\r\n\r\n";
if(-1 ==send( sock, request.c_str(), request.size(), 0 ) )
{
cout << "send error" <<endl;
close( sock );
return false;
}
int m_nContentLength = DEFAULT_PAGE_BUF_SIZE;
char *pageBuf = (char *)malloc(m_nContentLength);
memset(pageBuf, 0, m_nContentLength);
bytesRead = 0;
ret = 1;
while(ret > 0)
{
ret = recv(sock, pageBuf + bytesRead, m_nContentLength - bytesRead, 0);
if(ret > 0)
{
bytesRead += ret;
}
if( m_nContentLength - bytesRead<100)
{
cout << "\nRealloc memorry"<<endl;
m_nContentLength *=2;
pageBuf = (char*)realloc( pageBuf, m_nContentLength);
}
}
pageBuf[bytesRead] = '\0';
response = pageBuf;
close( sock );
return true;
//cout<< response <<endl;
}
//獲得某一個城市的名字和氣溫
void Get_weather_info(char *resp,vector<data> &v_weather_data)
{
char *tag1 = "<title>";
char *tag2 = "firstWeather";
char *tag3 = "red\">";
char *pos = strstr(resp,tag1);
if(pos == NULL)
{
cout<<"get city name failed"<<endl;
return;
}
pos += strlen(tag1);
char city_name[100];
char city_degree[20];
sscanf(pos,"%[^,]",city_name);
city_name[strlen(city_name)-12] = '\0';
cout<<"city_name "<<city_name<<endl;
pos = strstr(pos,tag2);
pos = strstr(pos,tag3);
pos += strlen(tag3);
sscanf(pos,"%[^<]",city_degree);
cout<<"city_degree "<<city_degree<<endl;
data city_data(city_name,city_degree);
v_weather_data.push_back(city_data);
}
// Collect the per-city page paths linked from the national index page.
//
// Scanning starts at the first occurrence of "wordStart" in resp. From there
// every non-empty href="..." value is appended to vec, in page order, until
// either the value just collected begins with "/YunNan/ZhaoTong" (that entry
// is kept and scanning stops) or no further href=" is found.
//
// Parameters:
//   resp - NUL-terminated page text; NULL is treated as "marker not found".
//   vec  - out: receives the extracted paths; existing contents are kept.
//
// Values longer than 99 characters are truncated to fit the local buffer.
void find_resource(const char *resp, std::vector<std::string> &vec)
{
    static const char kStartMarker[] = "wordStart";
    static const char kHrefOpen[] = "href=\"";
    static const char kLastCity[] = "/YunNan/ZhaoTong";

    const char *pos = (resp != NULL) ? strstr(resp, kStartMarker) : NULL;
    if (pos == NULL)
    {
        std::cout << "cant find wordstart" << std::endl;
        return;
    }

    char res[100];
    for (;;)
    {
        // pos sits on the value collected by the previous pass (or on the
        // start marker the first time round).
        if (strncmp(pos, kLastCity, strlen(kLastCity)) == 0)
            break;

        pos = strstr(pos, kHrefOpen);
        if (pos == NULL)
            break;  // no more links on the page
        pos += strlen(kHrefOpen);

        // Skip an empty href="" instead of pushing stale buffer contents.
        if (sscanf(pos, "%99[^\"]", res) == 1)
            vec.push_back(res);
    }
}
int main()
{
char* resp = (char*)malloc(MALLOC_SIZE);
int byteread;
vector<string> vec;
vector<data> v_weather_data;
string res_quanguo = "/quanguo";
GetHttpResponse(res_quanguo,resp,byteread);
cout<<"byteread"<<byteread<<endl;
//獲取各個城市的頁面名稱
find_resource(resp,vec);
vector<string>::iterator iter = vec.begin();
int k = 1;
vec[126] = "/GuangDong/JiangMen.html";
for(; iter!=vec.end(); iter++)
{
cout<<k++<<" "<<*iter<<endl;
}
//獲取各個城市頁面的信息,然後找到名字和溫度 插入vec
// 多次建立拆除鏈接,效率低下
for(k=0;k<vec.size();++k)
{
memset(resp,'\0',MALLOC_SIZE);
GetHttpResponse(vec[k],resp,byteread);
Get_weather_info(resp,v_weather_data);
}
//將vec按溫度排序
sort(v_weather_data.begin(),v_weather_data.end(),wea_cmp);
vector<data>::iterator iter1 = v_weather_data.begin();
int No = 1;
for(; iter1 != v_weather_data.end(); iter1++ )
{
cout<<"No."<<No<<(*iter1).name<<" "<<(*iter1).degree<<endl;
No++;
}
free(resp);
return 0;
}
搞完跑一下 能拿到排行,但有個很大的問題,要跑上五分鐘才能出結果... 300多個城市,每一個都建立連接,獲取數據,拆除連接,好low 然後查了下 瀏覽器請求一個有很多資源的頁面時,大都用長連接,keep-alive
這樣一個連接可以傳輸多個資源,可以考慮用這種方法試一下。但一次請求 300 個應該也費勁(不太清楚請求報文的最大長度限制),有些 web 服務器只讀取請求報文的前 100 字節。不過一次只請求一個又有些浪費,使用長連接應該能讓程序效率高許多。
今天排了一下,大西安果然不負我望