1.根據CSDN文章類型獲取對應類型的文章鏈接
/**
 * Crawls the CSDN navigation page of each category and stores the listed
 * articles whose read count is greater than 100.
 *
 * <p>For every category slug the page {@code https://www.csdn.net/nav/<slug>}
 * is fetched, the element with id {@code feedlist_id} is located, and each
 * descendant carrying the class {@code clearfix} is parsed into an
 * {@link Article} (link, title, summary, read count, comment count). Entries
 * that lack any of the required child nodes are skipped instead of aborting
 * the crawl.
 *
 * @throws IOException if a page cannot be fetched, or if a fetched page does
 *     not contain the {@code feedlist_id} container
 * @throws NumberFormatException if a non-empty read or comment counter is not
 *     a plain integer
 */
public void searchCsdnUrl() throws IOException {
    // Category slugs appended to the navigation URL; the order is the crawl order.
    String[] categories = {
        "web", "ai", "cloud", "db", "fund", "career", "game", "engineering",
        "mobile", "sec", "iot", "lang", "arch", "ops", "avi", "other"
    };
    for (String type : categories) {
        String url = "https://www.csdn.net/nav/" + type;
        // Build the HTTP GET request with a browser-like user agent and a 1000 ms timeout.
        Connection conn = Jsoup.connect(url)
                .userAgent("Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10")
                .timeout(1000)
                .method(Connection.Method.GET);
        Document doc = conn.get();
        Element articleListDiv = doc.body().getElementById("feedlist_id");
        if (articleListDiv == null) {
            // getElementById returns null when the id is absent; fail with context instead of an NPE.
            throw new IOException("Element with id 'feedlist_id' not found at " + url);
        }
        Elements articleList = articleListDiv.getElementsByClass("clearfix");
        for (Element article : articleList) {
            Elements linkNodes = article.select("div h2 a");
            Elements summaryNodes = article.getElementsByClass("summary oneline");
            Elements readNodes = article.getElementsByClass("read_num");
            // NOTE(review): the trailing space in the class name is kept exactly as in the
            // original lookup; confirm against the live markup before normalizing it.
            Elements commentNodes = article.getElementsByClass("common_num ");
            if (linkNodes.isEmpty() || summaryNodes.isEmpty()
                    || readNodes.isEmpty() || commentNodes.isEmpty()) {
                // Not a complete article entry; skip it rather than fail on get(0).
                continue;
            }
            Element linkNode = linkNodes.get(0);

            Article articleEntity = new Article();
            articleEntity.setAddress(linkNode.attr("href"));
            articleEntity.setTitle(linkNode.text());
            articleEntity.setDesption(summaryNodes.get(0).text());

            // An empty counter text is treated as zero.
            String readText = readNodes.get(0).getElementsByClass("num").text();
            if ("".equals(readText)) {
                articleEntity.setReadNum(0);
            } else {
                articleEntity.setReadNum(Integer.parseInt(readText));
            }
            String commentText = commentNodes.get(0).getElementsByClass("num").text();
            if ("".equals(commentText)) {
                articleEntity.setCommentNum(0);
            } else {
                articleEntity.setCommentNum(Integer.parseInt(commentText));
            }

            articleEntity.setStatus(0);
            articleEntity.setType(type);
            articleEntity.setTime(new Date());
            articleEntity.setBlogType("CSDN");
            // Only articles read more than 100 times are persisted.
            if (articleEntity.getReadNum() > 100) {
                articleDao.save(articleEntity);
            }
            System.out.println("文章閱讀數+++++++++++:" + articleEntity.getReadNum());
        }
        System.out.println("文章總數++++++++++++:" + articleList.size());
        System.out.println("文章絕對路勁地址:http://blog.csdn.net" + "++++++++++++++++++++++++");
    }
}
//通過CSDN文章url獲取文章內容
/**
 * Downloads a CSDN article page and wraps its content in a minimal HTML document.
 *
 * <p>The returned document consists of the outer HTML of the element with id
 * {@code content_views}, followed by a link back to {@code postUrl}.
 *
 * @param postUrl URL of the article page to fetch
 * @return an HTML document string containing the article content and a link
 *     to the original article
 * @throws IOException if the page cannot be fetched, or if it does not contain
 *     an element with id {@code content_views}
 */
public String searchCsdnUrl(String postUrl) throws IOException {
    // Build the HTTP GET request with a browser-like user agent and a 1000 ms timeout.
    Connection conn = Jsoup.connect(postUrl)
            .userAgent("Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10")
            .timeout(1000)
            .method(Connection.Method.GET);
    Document doc = conn.get();
    Element articleListDiv = doc.body().getElementById("content_views");
    if (articleListDiv == null) {
        // getElementById returns null when the id is absent; fail with context instead of an NPE.
        throw new IOException("Element with id 'content_views' not found at " + postUrl);
    }
    String html = "<!DOCTYPE html>\n" +
            "<html>\n" +
            "<body>" + articleListDiv.outerHtml();
    // Append a link back to the original article and close the document.
    html += "<br/>作者原文鏈接:" + "<p><a href=\"" + postUrl + "\" target=\"_blank\" rel=\"noopener\">" + postUrl + "</a></p>\n" +
            "</body>\n" +
            "</html>";
    System.out.println("文章總數:" + postUrl);
    return html;
}
2.根據OSC文章類型獲取OSC文章鏈接
/**
 * Crawls the OSChina blog listing of each configured classification and stores
 * every listed article.
 *
 * <p>For every classification id the page
 * {@code https://www.oschina.net/blog?classification=<id>} is fetched, the
 * element with id {@code recommendArticleList} is located, and each descendant
 * carrying the class {@code content} is parsed into an {@link Article}. The
 * read count is taken from the fourth {@code item} element and the comment
 * count from the fifth; a read count containing {@code K} is multiplied by
 * 1000. Entries that lack the required child nodes are skipped instead of
 * aborting the crawl.
 *
 * @throws IOException if a page cannot be fetched, or if a fetched page does
 *     not contain the {@code recommendArticleList} container
 * @throws NumberFormatException if a non-empty read or comment counter cannot
 *     be parsed as a number
 */
public void saveOSCUrl() throws IOException {
    // Each row is {OSChina classification id, article type stored locally}.
    // Keeping both in one table avoids the id list and the type mapping drifting apart.
    String[][] classifications = {
        {"5611447", "ai"},
        {"5593654", "cloud"},
        {"428639", "cloud"},
        {"5765988", "blockchain"},
        {"428602", "mobile"},
        {"428612", "web"},
        {"428640", "engineering"},
        {"429511", "game"},
        {"428609", "lang"},
        {"428610", "db"},
        {"428613", "ops"},
        {"6289115", "iot"},
        {"428647", "avi"},
        {"430381", "other"},
        {"428638", "arch"}
    };
    for (String[] classification : classifications) {
        String type = classification[0];
        String typeName = classification[1];
        String relUrl = "https://www.oschina.net/blog?classification=" + type;
        // Build the HTTP GET request with a browser-like user agent and a 1000 ms timeout.
        Connection conn = Jsoup.connect(relUrl)
                .userAgent("Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10")
                .timeout(1000)
                .method(Connection.Method.GET);
        Document doc = conn.get();
        Element articleListDiv = doc.body().getElementById("recommendArticleList");
        if (articleListDiv == null) {
            // getElementById returns null when the id is absent; fail with context instead of an NPE.
            throw new IOException("Element with id 'recommendArticleList' not found at " + relUrl);
        }
        Elements articleList = articleListDiv.getElementsByClass("content");
        for (Element article : articleList) {
            Elements linkNodes = article.select("div a");
            Elements summaryNodes = article.getElementsByClass("line-clamp");
            Elements items = article.getElementsByClass("item");
            if (linkNodes.isEmpty() || summaryNodes.isEmpty() || items.size() < 5) {
                // Not a complete article entry; skip it rather than fail on an index lookup.
                continue;
            }
            Element linkNode = linkNodes.get(0);

            Article articleEntity = new Article();
            articleEntity.setAddress(linkNode.attr("href"));
            articleEntity.setTitle(linkNode.text());
            articleEntity.setDesption(summaryNodes.get(0).text());

            // Read count: empty means zero, a "K" marker means thousands.
            String readNum = items.get(3).text();
            if ("".equals(readNum)) {
                articleEntity.setReadNum(0);
            } else if (readNum.contains("K")) {
                float thousands = Float.parseFloat(readNum.replaceAll("K", ""));
                articleEntity.setReadNum((int) (thousands * 1000));
            } else {
                articleEntity.setReadNum(Integer.parseInt(readNum));
            }

            // Comment count: empty means zero.
            String commentNum = items.get(4).text();
            if ("".equals(commentNum)) {
                articleEntity.setCommentNum(0);
            } else {
                articleEntity.setCommentNum(Integer.parseInt(commentNum));
            }

            articleEntity.setType(typeName);
            articleEntity.setTime(new Date());
            articleEntity.setBlogType("OSC");
            articleEntity.setStatus(0);
            articleDao.save(articleEntity);
            System.out.println("文章閱讀數++++++=:" + articleEntity.getReadNum());
            System.out.println("文章總數++=:" + articleEntity.getAddress());
            System.out.println("文章類型+++:" + articleEntity.getType());
        }
        System.out.println("文章總數:" + articleList.size());
        System.out.println("文章絕對路勁地址:oooooooooooooooo++++++++++++++++++++++++++++++");
    }
}
//通過OSC文章鏈接獲取博客內容
/**
 * Downloads an OSChina article page and wraps its content in a minimal HTML document.
 *
 * <p>The children of the element with id {@code articleContent} are collected,
 * the first child is dropped, and the remaining children are placed inside a
 * {@code div} with id {@code content_views}, followed by a link back to
 * {@code postUrl}.
 *
 * @param postUrl URL of the article page to fetch
 * @return an HTML document string containing the article content and a link
 *     to the original article
 * @throws IOException if the page cannot be fetched, or if it does not contain
 *     an element with id {@code articleContent}
 */
public String searchOscUrl(String postUrl) throws IOException {
    // Build the HTTP GET request with a browser-like user agent and a 1000 ms timeout.
    Connection conn = Jsoup.connect(postUrl)
            .userAgent("Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10")
            .timeout(1000)
            .method(Connection.Method.GET);
    Document doc = conn.get();
    Element content = doc.body().getElementById("articleContent");
    if (content == null) {
        // getElementById returns null when the id is absent; fail with context instead of an NPE.
        throw new IOException("Element with id 'articleContent' not found at " + postUrl);
    }
    Elements articleListDiv = content.children();
    // NOTE(review): the first child is discarded, presumably because it is not part of the
    // article text - confirm against the live markup. Guarded so an empty container does
    // not raise IndexOutOfBoundsException.
    if (!articleListDiv.isEmpty()) {
        articleListDiv.remove(0);
    }
    String html = "<!DOCTYPE html>\n" +
            "<html>\n" +
            "<body>" + "<div class=\"htmledit_views\" id=\"content_views\"> " + articleListDiv.outerHtml() + "</div>";
    // Append a link back to the original article and close the document.
    html += "<br/>作者原文鏈接:" + "<p><a href=\"" + postUrl + "\" target=\"_blank\" rel=\"noopener\">" + postUrl + "</a></p>\n" +
            "</body>\n" +
            "</html>";
    System.out.println("文章總數:" + postUrl);
    return html;
}
3.公共bean
/**
 * Persistent bean describing one crawled blog article.
 *
 * <p>Mapped to the table {@code mto_csdn} and cached with the
 * {@code READ_WRITE} concurrency strategy.
 *
 * date: 2017-02-09
 */
@Entity
@Table(name = "mto_csdn")
@Cache(usage = CacheConcurrencyStrategy.READ_WRITE)
public class Article implements Serializable {

    private static final long serialVersionUID = -1153854616385727165L;

    /** Primary key, generated with the {@code IDENTITY} strategy. */
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    @SortableField
    @NumericField
    private long id;

    /** Address (link) of the article. */
    private String address;

    /** Article title. */
    private String title;

    /** Short summary of the article. */
    private String desption;

    /** Timestamp associated with the article, stored as a SQL timestamp. */
    @Temporal(value = TemporalType.TIMESTAMP)
    private Date time;

    /** Number of reads, stored in column {@code read_num}. */
    @Column(name = "read_num")
    private int readNum;

    /** Number of comments, stored in column {@code comment_num}. */
    @Column(name = "comment_num")
    private int commentNum;

    /** Category of the article, stored in column {@code type}. */
    @Column(name = "type")
    private String type;

    /** Status flag, stored in column {@code status}. */
    @Column(name = "status")
    private int status;

    /** Name of the blog platform the article came from, stored in column {@code blog_type}. */
    @Column(name = "blog_type")
    private String blogType;

    // ---- accessors, listed in field declaration order ----

    /** @return the primary key */
    public long getId() {
        return id;
    }

    /** @param id the primary key */
    public void setId(long id) {
        this.id = id;
    }

    /** @return the article address */
    public String getAddress() {
        return address;
    }

    /** @param address the article address */
    public void setAddress(String address) {
        this.address = address;
    }

    /** @return the article title */
    public String getTitle() {
        return title;
    }

    /** @param title the article title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the article summary */
    public String getDesption() {
        return desption;
    }

    /** @param desption the article summary */
    public void setDesption(String desption) {
        this.desption = desption;
    }

    /** @return the timestamp associated with the article */
    public Date getTime() {
        return time;
    }

    /** @param time the timestamp associated with the article */
    public void setTime(Date time) {
        this.time = time;
    }

    /** @return the number of reads */
    public int getReadNum() {
        return readNum;
    }

    /** @param readNum the number of reads */
    public void setReadNum(int readNum) {
        this.readNum = readNum;
    }

    /** @return the number of comments */
    public int getCommentNum() {
        return commentNum;
    }

    /** @param commentNum the number of comments */
    public void setCommentNum(int commentNum) {
        this.commentNum = commentNum;
    }

    /** @return the article category */
    public String getType() {
        return type;
    }

    /** @param type the article category */
    public void setType(String type) {
        this.type = type;
    }

    /** @return the status flag */
    public int getStatus() {
        return status;
    }

    /** @param status the status flag */
    public void setStatus(int status) {
        this.status = status;
    }

    /** @return the blog platform name */
    public String getBlogType() {
        return blogType;
    }

    /** @param blogType the blog platform name */
    public void setBlogType(String blogType) {
        this.blogType = blogType;
    }
}