文章詳情頁

半小時實現Java手擼網絡爬蟲框架(附完整源碼)

瀏覽：205日期：2022-08-09 15:38:57

最近在做一個搜索相關的項目，需要爬取網絡上的一些鏈接存儲到索引庫中，雖然有很多開源的強大的爬蟲框架，但本著學習的態度，自己寫了一個簡單的網絡爬蟲，以便了解其中的原理。今天，就為小伙伴們分享下這個簡單的爬蟲程序！！

首先介紹每個類的功能：

DownloadPage.java 的功能是下載此超鏈接的頁面源代碼。 FunctionUtils.java 的功能是提供不同的靜態方法，包括：頁面鏈接正則表達式匹配、獲取URL鏈接的元素、判斷是否創建文件、獲取頁面的Url并將其轉換為規范的Url、截取網頁源文件的目標內容。 HrefOfPage.java 的功能是獲取頁面源代碼的超鏈接。 UrlDataHanding.java 的功能是整合各個類，實現從url到獲取數據再到數據處理的流程。 UrlQueue.java 是未訪問Url隊列。 VisitedUrlQueue.java 是已訪問過的URL隊列。

下面介紹一下每個類的源代碼：

DownloadPage.java 此類要用到HttpClient組件。

package com.sreach.spider;import java.io.IOException;import org.apache.http.HttpEntity;import org.apache.http.HttpResponse;import org.apache.http.client.ClientProtocolException;import org.apache.http.client.HttpClient;import org.apache.http.client.methods.HttpGet;import org.apache.http.impl.client.DefaultHttpClient;import org.apache.http.util.EntityUtils;/** * @author binghe */public class DownloadPage { /** * 根據(jù)URL抓取網頁內容 * * @param url * @return */ public static String getContentFormUrl(String url) { /* 實例化一個HttpClient客戶端 */ HttpClient client = new DefaultHttpClient(); HttpGet getHttp = new HttpGet(url); String content = null; HttpResponse response; try { /* 獲得信息載體 */ response = client.execute(getHttp); HttpEntity entity = response.getEntity(); VisitedUrlQueue.addElem(url); if (entity != null) { /* 轉化為文本信息 */ content = EntityUtils.toString(entity); /* 判斷是否符合下載網頁源代碼到本地的條件 */ if (FunctionUtils.isCreateFile(url) && FunctionUtils.isHasGoalContent(content) != -1) { FunctionUtils.createFile( FunctionUtils.getGoalContent(content), url); } } } catch (ClientProtocolException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { client.getConnectionManager().shutdown(); } return content; }}

FunctionUtils.java 此類的方法均為static方法

package com.sreach.spider;import java.io.BufferedWriter;import java.io.File;import java.io.FileOutputStream;import java.io.IOException;import java.io.OutputStreamWriter;import java.util.regex.Matcher;import java.util.regex.Pattern;/** * @author binghe */public class FunctionUtils { /** * 匹配超鏈接的正則表達式 */ private static String pat = 'http://www.oschina.net/code/explore/.*/w+.[a-zA-Z]+'; private static Pattern pattern = Pattern.compile(pat); private static BufferedWriter writer = null; /** * 爬蟲搜索深度 */ public static int depth = 0; /** * 以'/'來分割URL,獲得超鏈接的元素 * * @param url * @return */ public static String[] divUrl(String url) { return url.split('/'); } /** * 判斷是否創(chuàng)建文件 * * @param url * @return */ public static boolean isCreateFile(String url) { Matcher matcher = pattern.matcher(url); return matcher.matches(); } /** * 創(chuàng)建對應文件 * * @param content * @param urlPath */ public static void createFile(String content, String urlPath) { /* 分割url */ String[] elems = divUrl(urlPath); StringBuffer path = new StringBuffer(); File file = null; for (int i = 1; i < elems.length; i++) { if (i != elems.length - 1) { path.append(elems[i]); path.append(File.separator); file = new File('D:' + File.separator + path.toString()); } if (i == elems.length - 1) { Pattern pattern = Pattern.compile('w+.[a-zA-Z]+'); Matcher matcher = pattern.matcher(elems[i]); if ((matcher.matches())) { if (!file.exists()) { file.mkdirs(); } String[] fileName = elems[i].split('.'); file = new File('D:' + File.separator + path.toString() + File.separator + fileName[0] + '.txt'); try { file.createNewFile(); writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file))); writer.write(content); writer.flush(); writer.close(); System.out.println('創(chuàng)建文件成功'); } catch (IOException e) { e.printStackTrace(); } } } } } /** * 獲取頁面的超鏈接并將其轉換為正式的A標簽 * * @param href * @return */ public static String getHrefOfInOut(String href) { /* 內外部鏈接最終轉化為完整的鏈接格式 */ String resultHref = null; /* 判斷是否為外部鏈接 */ if 
(href.startsWith('http://')) { resultHref = href; } else { /* 如果是內部鏈接,則補充完整的鏈接地址,其他的格式忽略不處理,如：a href='http://www.b3g6.com/bcjs/4429.html#' rel='external nofollow' */ if (href.startsWith('/')) { result + href; } } return resultHref; } /** * 截取網頁網頁源文件的目標內容 * * @param content * @return */ public static String getGoalContent(String content) { int sign = content.indexOf('<pre class=''); String signContent = content.substring(sign); int start = signContent.indexOf('>'); int end = signContent.indexOf('</pre>'); return signContent.substring(start + 1, end); } /** * 檢查網頁源文件中是否有目標文件 * * @param content * @return */ public static int isHasGoalContent(String content) { return content.indexOf('<pre class=''); }}

HrefOfPage.java 此類為獲取頁面的超鏈接

package com.sreach.spider;/** * @author binghe * */public class HrefOfPage { /** * 獲得頁面源代碼中超鏈接 */ public static void getHrefOfContent(String content) { System.out.println('開始'); String[] contents = content.split('<a href='http://www.b3g6.com/bcjs/4429.html'); for (int i = 1; i < contents.length; i++) { int endHref = contents[i].indexOf('''); String aHref = FunctionUtils.getHrefOfInOut(contents[i].substring( 0, endHref)); if (aHref != null) { String href = FunctionUtils.getHrefOfInOut(aHref); if (!UrlQueue.isContains(href) && href.indexOf('/code/explore') != -1 && !VisitedUrlQueue.isContains(href)) { UrlQueue.addElem(href); } } } System.out.println(UrlQueue.size() + '--抓取到的連接數(shù)'); System.out.println(VisitedUrlQueue.size() + '--已處理的頁面數(shù)'); }}

UrlDataHanding.java 此類主要是從未訪問隊列中獲取url，下載頁面，分析url，保存已訪問url等操作，實現Runnable接口

package com.sreach.spider;

/**
 * Worker that drains {@link UrlQueue}: it downloads each URL and feeds the
 * page to {@link HrefOfPage} so that newly found links are queued.
 *
 * @author binghe
 */
public class UrlDataHanding implements Runnable {

    /**
     * Downloads the page behind {@code url} and queues the links found in it.
     * A page that could not be downloaded is skipped.
     *
     * @param url the URL to process
     */
    public void dataHanding(String url) {
        String content = DownloadPage.getContentFormUrl(url);
        if (content != null) {
            HrefOfPage.getHrefOfContent(content);
        }
    }

    /**
     * Processes queued URLs until the queue is empty.
     */
    @Override
    public void run() {
        while (!UrlQueue.isEmpty()) {
            String url;
            try {
                url = UrlQueue.outElem();
            } catch (java.util.NoSuchElementException e) {
                // Another worker took the last URL between isEmpty() and outElem().
                break;
            }
            dataHanding(url);
        }
    }
}

UrlQueue.java 此類主要是用來存放未訪問的URL隊列

package com.sreach.spider;import java.util.LinkedList;/** * @author binghe * */public class UrlQueue { /** 超鏈接隊列 */ public static LinkedList<String> urlQueue = new LinkedList<String>(); /** 隊列中對應最多的超鏈接數(shù)量 */ public static final int MAX_SIZE = 10000; public synchronized static void addElem(String url) { urlQueue.add(url); } public synchronized static String outElem() { return urlQueue.removeFirst(); } public synchronized static boolean isEmpty() { return urlQueue.isEmpty(); } public static int size() { return urlQueue.size(); } public static boolean isContains(String url) { return urlQueue.contains(url); }}

VisitedUrlQueue.java 主要是保存已訪問過的URL，使用HashSet來保存，主要是考慮到每個訪問過的URL是不同。HashSet剛好符合這個要求

package com.sreach.spider;import java.util.HashSet;/** * 已訪問url隊列 * @author binghe * */public class VisitedUrlQueue { public static HashSet<String> visitedUrlQueue = new HashSet<String>(); public synchronized static void addElem(String url) { visitedUrlQueue.add(url); } public synchronized static boolean isContains(String url) { return visitedUrlQueue.contains(url); } public synchronized static int size() { return visitedUrlQueue.size(); }}

Test.java 此類為測試類

import java.sql.SQLException;import com.sreach.spider.UrlDataHanding;import com.sreach.spider.UrlQueue;/** * @author binghe * */public class Test { public static void main(String[] args) throws SQLException { String url = 'http://www.oschina.net/code/explore/achartengine/client/AndroidManifest.xml'; String url1 = 'http://www.oschina.net/code/explore'; String url2 = 'http://www.oschina.net/code/explore/achartengine'; String url3 = 'http://www.oschina.net/code/explore/achartengine/client'; UrlQueue.addElem(url); UrlQueue.addElem(url1); UrlQueue.addElem(url2); UrlQueue.addElem(url3); UrlDataHanding[] url_Handings = new UrlDataHanding[10]; for (int i = 0; i < 10; i++) { url_Handings[i] = new UrlDataHanding(); new Thread(url_Handings[i]).start(); } }}

說明一下：由于我抓取的是針對oschina的，所以里面的url正則表達式不適合其他網站，需要自己修改一下。你也可以寫成xml來配置。

到此這篇關于半小時實現Java手擼網絡爬蟲框架(附完整源碼)的文章就介紹到這了，更多相關Java 網絡爬蟲框架內容請搜索好吧啦網以前的文章或繼續瀏覽下面的相關文章，希望大家以后多多支持好吧啦網！

Java

上一條：詳解Java中CountDownLatch異步轉同步工具類下一條：淺談Java中FastJson的使用

相關文章：

1. IntelliJ IDEA恢復刪除文件的方法2. IntelliJ IDEA配置Tomcat服務器的方法3. docker鏡像完全卸載的操作步驟4. 使用Maven 搭建 Spring MVC 本地部署Tomcat的詳細教程5. idea刪除項目的操作方法6. IntelliJ IDEA設置默認瀏覽器的方法7. IntelliJ IDEA導入jar包的方法8. idea導入maven項目的方法9. idea重置默認配置的方法步驟10. Docker 部署 Prometheus的安裝詳細教程

排行榜

					
					使用Maven 搭建 Spring MVC 本地部署Tomcat的詳細教程
IntelliJ IDEA恢復刪除文件的方法
docker鏡像完全卸載的操作步驟
IntelliJ IDEA設置默認瀏覽器的方法
IntelliJ IDEA配置Tomcat服務器的方法
IntelliJ IDEA導入jar包的方法
idea刪除項目的操作方法
Docker 部署 Prometheus的安裝詳細教程
idea導入maven項目的方法
idea重置默認配置的方法步驟
IntelliJ IDEA調整字體大小的方法