TA的每日心情 | 开心 2021-3-12 23:18 |
|---|
签到天数: 2 天 [LV.1]初来乍到
|
|
功能:给定一个URL,自动搜索页面下的图片,遇到URL自动记录并下载图片,重复N层后退出。 HTTP.java:主要代码和逻辑都在这个里头,URLDownPic 负责下载图片,用了十个线程并发去跑。
- 一:记录URL及它所处的层次
/**
 * Holds a crawl target URL together with the depth at which it was found.
 * A depth of 1 means the start page.
 */
public class URLObj
{
    private String urlString;
    private int index;

    /**
     * @param url   absolute URL of the page to crawl
     * @param index crawl depth of this URL (1 = start page)
     */
    public URLObj(String url, int index)
    {
        this.urlString = url;
        this.index = index;
    }

    /** @return the stored URL string */
    public String getUrlString()
    {
        return this.urlString;
    }

    /** @return the crawl depth of this URL */
    public int getIndex()
    {
        return this.index;
    }

    /** Replaces the stored URL string. */
    public void setUrlString(String urlString)
    {
        this.urlString = urlString;
    }

    /** Replaces the crawl depth. */
    public void setIndex(int index)
    {
        this.index = index;
    }
}
- 二、根据图片的绝对URL地址下载该图片
- import java.net.MalformedURLException;
- import java.net.URL;
- import java.io.*;
-
/**
 * Downloads a single image, given its absolute URL, into a fixed local
 * directory. The local file name is the last path segment of the URL.
 */
public class URLDownPic
{
    /** Directory where downloaded images are stored. */
    private static final String SAVE_DIR = "c:\\webimg";

    URL url;

    /**
     * Downloads the image at {@code imgUrl} into {@link #SAVE_DIR}.
     * Failures are reported to stdout; the method never throws.
     *
     * @param imgUrl absolute image URL, e.g. "http://host/path/pic.gif"
     */
    public void down(String imgUrl)
    {
        System.out.println("down......"+imgUrl);
        String fileName = imgUrl.substring(imgUrl.lastIndexOf("/")+1);
        try {
            url = new URL(imgUrl);
        } catch (MalformedURLException e) {
            System.out.println(imgUrl+"异常,无法下载");
            e.printStackTrace();
            // BUG FIX: the original fell through with url == null and then
            // threw NullPointerException on url.openStream().
            return;
        }

        InputStream is = null;
        try {
            is = url.openStream();
        } catch (IOException e) {
            e.printStackTrace();
            System.out.println(imgUrl+"没有下载成功1");
            return;
        }
        OutputStream os = null;
        File f = new File(SAVE_DIR);
        f.mkdirs();
        try {
            // BUG FIX: the original built "c:\webimg"+fileName — "\w" is not
            // a valid Java escape (compile error) and the path separator
            // between the directory and the file name was missing.
            os = new FileOutputStream(new File(f, fileName));
            int bytesRead = 0;
            byte[] buffer = new byte[8192];
            while ((bytesRead = is.read(buffer, 0, 8192)) != -1) {
                os.write(buffer, 0, bytesRead);
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
            System.out.println(imgUrl+"没有下载成功2");
        } catch (IOException e) {
            e.printStackTrace();
            System.out.println(imgUrl+"没有下载成功3");
        } finally {
            // BUG FIX: the original leaked both streams on every call.
            if (os != null) {
                try { os.close(); } catch (IOException ignored) { }
            }
            try { is.close(); } catch (IOException ignored) { }
        }
    }

    public static void main(String args[]) {
        new URLDownPic().down("http://www.baidu.com/logo.gif");
    }
}
- 三、处理主要逻辑,打开一个网页,提取里面和网址和图片地址等
- import java.io.BufferedReader;
- import java.io.InputStreamReader;
- import java.net.URL;
- import java.net.URI;
- import java.net.MalformedURLException;
- import java.net.URISyntaxException;
- import java.net.URLConnection;
- import java.util.ArrayList;
- import java.util.HashSet;
- import java.util.Iterator;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
-
- public class HTTP implements Runnable
- {
- StringBuffer textStringBuffer;
- public static HashSet
-
- allUrlSet; //这两个hashSet是作为是否有重复URL检测用
- public static HashSet
-
- allPicSet;
-
- public static ArrayList
-
- allUrList; //全局变量
-
- HashSet
-
- curPagePicSet; //当前页面的图片
-
- //正则表达式,前半是网址,后半是JPG图像,两者得其一就是匹配,
- //这个正则只提取单引号或双引号中的图片或网址链接,网上有好些的,可以改进。
- String patternStrs=
- "(< a\s*href=[""](.*?)[""].*?>)|(?:(src|SRC|background|BACKGROUND)=[""](.*?)[""].*?>)";
-
- public String urlString =null;
-
- URLDownPic downPic= null; //下载工具类
-
- /** 搜索的深度,如果为1就只搜索当前页面的所有图片 */
- public static int searchDepth = 1;
-
- /** 线程如果发现没有队列中没有URL,就睡眠一次,睡N次后,就中止线程 */
- int sleepTimes = 0;
- int sleepMaxTime = 10;
-
- public HTTP(String url) //构造函数
- {
- this();
- urlString = url;
- }
-
- public HTTP()
- {
- downPic = new URLDownPic();
- textStringBuffer = new StringBuffer();
- if (allUrlSet== null)
- {
- allUrlSet = new HashSet< String>();
- }
- if (allPicSet == null)
- {
- allPicSet = new HashSet< String>();
- }
- if (allUrList ==null)
- {
- allUrList = new ArrayList< URLObj>();
- }
- curPagePicSet = new HashSet< String>();
- }
- public String getAbsoluteURL(String baseURI, String relativePath){ //由相对URL,得到绝对URL
- //System.out.println("baseURI="+baseURI);
- //System.out.println("相对路径="+relativePath);
- String abURL=null;
- try {
- URI base=new URI(baseURI);//如:baseURI="http://hi.baidu.com/zeiysiufehbbeqq/"
- URI abs=base.resolve(relativePath);//解析网页的相对URL,得到绝对URI
- URL absURL=abs.toURL();//转成URL
- //System.out.println(absURL);
- abURL = absURL.toString();
- } catch (MalformedURLException e) {
- System.out.println("相对路径转绝对路径时出错1!");
- e.printStackTrace();
- } catch (URISyntaxException e) {
- e.printStackTrace();
- System.out.println("相对路径转绝对路径时出错2!");
- } finally{
- return abURL;
- }
- }
- public String getText(String url)
- {
- // System.out.println("处理文本="+url);
- try
- {
- String urlName = url;
- URL U = new URL(urlName);
-
- URLConnection connection = U.openConnection();
- connection.setConnectTimeout(2000);
- connection.connect();
-
- BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()));
- String line;
- while ((line = in.readLine()) != null)
- {
- textStringBuffer.append(line);
-
- }
-
- in.close();
- return textStringBuffer.toString();
- }
- catch (Exception e)
- {
- System.out.println("从"+url+"处获取文本没有结果!");
- e.printStackTrace();
- }
- return null;
- }
-
- public static String getBaseURI(URLObj urlObj){
-
- String urlString=urlObj.getUrlString();
- int i=urlString.lastIndexOf("/");
- if(i==6) return urlString+"/";
- else
- return urlString.substring(0,i+1);
- }
-
- public void go(URLObj urlObj)
-
- {
- System.out.println("处理网页="+urlObj.getUrlString());
- //得到当前网页的baseURI
- String baseURI=getBaseURI(urlObj);
-
- //根据url得到HTTP流
- String httpString = getText(urlObj.getUrlString());
-
- //跟据正则表达式获得网址,一类是图片,一类是网页网址,递归找啊!
- Pattern p=Pattern.compile(patternStrs);
- if (p == null) //没有找到就返回
- {
- return;
- }
- Matcher m=p.matcher(httpString);
-
- while (m!=null && m.find())
- {
- //找到的有可能是重复的,有可能是别的网页已经下过的,就跳过,
- //否则加入HASHSET,下面进行下载和递归搜索
- byte stringType=-1; //该字符串种类,1是网页URL,2是JPG图片
- String tempS = null;
- if (m.group(2)!=null) //这个是URL
- {
- tempS= m.group(2);
- //这里只搜索下面这些扩展名结尾的,省略了好多,
- //用网址直接给出的图片也省了,如: href="http://www.baidu.com/aa/1.gif"之类
- if(!tempS.matches(".*(?i)(\.(cn)$|\.(com)$|\.(net)$|\.(org)$|\.(htm)$|
- \.(html)$|\.(jsp)$|\.(asp)$|\.(php)$|\.(aspx)$)"))
- continue;
- stringType = 1;
-
- }
- else
- {
- tempS= m.group(4);
- //只下载下面扩展名结尾的
- if(!tempS.matches(".*(?i)(\.(gif)$|\.(jpg)$|\.(png)$|\.(jpeg)$|\.(bmp)$)"))
- continue;
- stringType = 2;
-
- }
-
- //如果是相对路径就转成绝对路径
- if (!tempS.startsWith("http"))
- {
-
- tempS = getAbsoluteURL(baseURI,tempS);
- }
- // System.out.println("找到="+tempS);
- switch (stringType)
- {
- case 1:
- if (allUrlSet.contains(tempS)) //判断是否登录过此网页
- continue;
- if (urlObj.getIndex()>searchDepth) //是否超过了应爬的深度
- continue;
-
- //加锁,防止添加时有别的线程删除
- synchronized (allUrList)
- {
- allUrList.add(new URLObj(tempS,urlObj.getIndex()+1));
- }
- allUrlSet.add(tempS); //记录此页面已登记过
- break;
- case 2:
- if (!allPicSet.contains(tempS))
- {
- curPagePicSet.add(tempS);
- allPicSet.add(tempS);
- }
- break;
- default:
- break;
- }
-
-
- }
-
- //疯狂下图。。。
- Iterator
-
- iterator = curPagePicSet.iterator();
- while (iterator.hasNext())
- {
- String imgUrl = iterator.next();
- downPic.down(imgUrl);
- }
-
-
- }
-
-
- @Override
- public void run()
- {
- boolean isHasNewUrl = false;
-
- while (true)
- {
- try
- {
- if (allUrList.size()>0)
- {
- URLObj urlObj = null;
- synchronized (allUrList)
- {
- if (allUrList.size()>0)
- {
- urlObj = allUrList.get(0);
- allUrList.remove(0);
- isHasNewUrl= true;
- }
- }
- if (isHasNewUrl)
- {
- go(urlObj);
- }
- }
- else
- {
- Thread.sleep(500);
- sleepTimes++;
- if (sleepTimes>sleepMaxTime)
- {
- break;
- }
- }
- }
- catch (Exception e)
- {
- System.out.println("线程运行时发生错误");
- e.printStackTrace();
- }
- }
-
-
- }
-
- public static void main(String args[])
- {
- String urlString ="http://www.baidu.com";//必须以http打头
- allUrList = new ArrayList< URLObj>(); //将要爬的网页放在这里,绝对地址
- allUrlSet = new HashSet< String>();
- allUrList.add(new URLObj(urlString,1));
- allUrlSet.add(urlString); //已经爬过的网页放在这里
-
- Thread[] http = new Thread[10];
-
- for (int i = 0; i < http.length; i++)
- {
- http[i] = new Thread(new HTTP());
- http[i].start();
- }
- }
-
- }
-
-
-
-
-
-
复制代码
源码下载:http://file.javaxxz.com/2014/11/3/000200859.zip |
|