不要被这个世界改变!

爬取网站数据

这本来是个分享的,但插件不好使了,等有时间再搞一个
 
						package com.zzger.model;
  
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.CountDownLatch;
  
import com.zzger.module.queue.UrlQueue;
import com.zzger.util.HttpUtils;
import com.zzger.util.RegexUtils;
  
/**
 * Holds the crawl state for a single web site: the site URL, a queue of URLs
 * still to be crawled, a list of URLs already crawled, and two latches used
 * by the worker threads.
 *
 * NOTE(review): the text of this class from the body of paxing onward appears
 * to have been damaged (see the note on paxing); the class does not compile
 * in its current form.
 */
public class WebSite {
  
    /**
     * Site URL.
     */
    private String url;
      
    /**
     * Queue of URLs that still need to be crawled.
     */
    private UrlQueue urls = new UrlQueue<>();
      
    /**
     * URLs of pages that have already been crawled.
     */
    private List exitUrls = Collections.synchronizedList(new ArrayList<>());
      
    /**
     * Initial count used for mDoneSignal.
     */
    private static final int TOTAL_THREADS = 12;  
      
    /**
     * Latch created with a count of 1. In the damaged worker text further
     * down, each worker appears to call countDown() and then await() on it.
     */
    private final CountDownLatch mStartSignal = new CountDownLatch(1);  
      
    /**
     * Latch created with a count of TOTAL_THREADS. In the damaged text further
     * down, it appears to be counted down once per worker and awaited after
     * the workers have been started.
     */
    private final CountDownLatch mDoneSignal = new CountDownLatch(TOTAL_THREADS);   
      
    /**
     * Creates the crawl state for a site and queues its URL for crawling.
     *
     * @param url the site URL; stored in this object and offered to the URL queue
     */
    public WebSite(String url){
        this.url = url;
        urls.offer(url);// add the site's home page to the queue of URLs to crawl
    }
      
    /**
     * Starts a new thread that fetches the site URL with HttpUtils.httpGet and
     * passes the result to paxing. Returns without waiting for that thread.
     */
    public void guangDu(){
        new Thread(new Runnable() {
            @Override
            public void run() {
                paxing(HttpUtils.httpGet(url));
            }
        }).start();
    }
      
    /**
     * Processes the HTML text of a fetched page.
     *
     * NOTE(review): everything from the first statement of this method to the
     * end of the class appears to be damaged. The string literals are split
     * across physical lines, and the text between the RegexUtils.RegexString
     * call and the Gxpage construction is missing. The missing text includes
     * the rest of this method and what looks like the header of the dxcPx()
     * method, which main calls but which is not declared anywhere in the
     * visible text. Restore this region from the original source before
     * relying on it; the code below is kept exactly as found.
     *
     * @param html page content as returned by HttpUtils.httpGet
     */
    public void paxing(String html){
        if(html.lastIndexOf("下一页
")<0) return ; String strList = html.substring(html.indexOf("
  • "), html.lastIndexOf("下一页
  • ")); String url = RegexUtils.RegexString(" page = new Gxpage(urls.take()); List<>> list = page.ybhqSection().getSections(); for(Section section : list){ new Thread(new Runnable() { @Override public void run() { mStartSignal.countDown();// 计数减一为0,工作线程真正启动具体操作 try { mStartSignal.await();// 阻塞,等待mStartSignal计数为0运行后面的代码 // 所有的工作线程都在等待同一个启动的命令 } catch (InterruptedException e) { e.printStackTrace(); } DuanZi duanzi = section.select().getModel(); System.out.println(duanzi.getTitle()); mDoneSignal.countDown();// 完成以后计数减一 } } ).start(); } try { mDoneSignal.await();// 等待所有工作线程结束 } catch (InterruptedException e) { e.printStackTrace(); } dxcPx();//线程任务执行完后,再次获取url队列进行任务 } public static void main(String[] args) { WebSite web = new WebSite("http://duanziwang.com"); web.guangDu(); for(int i = 0; i<10;i++){ new Thread(new Runnable() { @Override public void run() { web.dxcPx(); } }).start(); } } } //该余额宝领红包码来自于http://www.codesnippet.cn/detail/2408201715272.html