nutch1.4 crawl详解

peigang

浏览: 167334 次
性别:
来自: 北京

最近访客更多访客>>

yxmzhg

yexiaoshunfeier

wd1282988143

the12thwolf

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

nutch

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.crawl;

import java.util.*;
import java.text.*;

// Commons Logging imports
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.parse.ParseSegment;
import org.apache.nutch.indexer.solr.SolrDeleteDuplicates;
import org.apache.nutch.indexer.solr.SolrIndexer;
import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;

import org.apache.nutch.fetcher.Fetcher;

public class Crawl extends Configured implements Tool {
  public static final Logger LOG = LoggerFactory.getLogger(Crawl.class);

  private static String getDate() {
    return new SimpleDateFormat("yyyyMMddHHmmss").format
      (new Date(System.currentTimeMillis()));
  }


  /* Perform complete crawling and indexing (to Solr) given a set of root urls and the -solr
     parameter respectively. More information and Usage parameters can be found below. */
  public static void main(String args[]) throws Exception {
    Configuration conf = NutchConfiguration.create();
    int res = ToolRunner.run(conf, new Crawl(), args);
    System.exit(res);
  }
  

  public int run(String[] args) throws Exception {
    if (args.length < 1) {
      System.out.println
      ("Usage: Crawl <urlDir> -solr <solrURL> [-dir d] [-threads n] [-depth i] [-topN N]");
      return -1;
    }
    Path rootUrlDir = null;
    Path dir = new Path("crawl-" + getDate());
    int threads = getConf().getInt("fetcher.threads.fetch", 10);
    int depth = 5;
    long topN = Long.MAX_VALUE;
    String solrUrl = null;
    
    for (int i = 0; i < args.length; i++) {
      if ("-dir".equals(args[i])) {
        dir = new Path(args[i+1]);
        i++;
      } else if ("-threads".equals(args[i])) {
        threads = Integer.parseInt(args[i+1]);
        i++;
      } else if ("-depth".equals(args[i])) {
        depth = Integer.parseInt(args[i+1]);
        i++;
      } else if ("-topN".equals(args[i])) {
          topN = Integer.parseInt(args[i+1]);
          i++;
      } else if ("-solr".equals(args[i])) {
        solrUrl = args[i + 1];
        i++;
      } else if (args[i] != null) {
        rootUrlDir = new Path(args[i]);
      }
    }
    
    JobConf job = new NutchJob(getConf());

    if (solrUrl == null) {
      LOG.warn("solrUrl is not set, indexing will be skipped...");
    }

    FileSystem fs = FileSystem.get(job);

    if (LOG.isInfoEnabled()) {
      LOG.info("crawl started in: " + dir);
      LOG.info("rootUrlDir = " + rootUrlDir);
      LOG.info("threads = " + threads);
      LOG.info("depth = " + depth);      
      LOG.info("solrUrl=" + solrUrl);
      if (topN != Long.MAX_VALUE)
        LOG.info("topN = " + topN);
    }
    
    Path crawlDb = new Path(dir + "/crawldb");		//设置爬取库目录
    Path linkDb = new Path(dir + "/linkdb");		//链接库目录
    Path segments = new Path(dir + "/segments");	//段信息目录
    Path indexes = new Path(dir + "/indexes");		//索引目录
    Path index = new Path(dir + "/index");

  //初始化各项参数
    Path tmpDir = job.getLocalPath("crawl"+Path.SEPARATOR+getDate());
    Injector injector = new Injector(getConf());	//URL注入器对象；数据下载入口
    Generator generator = new Generator(getConf());	//生成器；生成待下载URL列表
    Fetcher fetcher = new Fetcher(getConf());		//抓取器；按照HTTP协议访问互联网，获取网页数据具体内容。下载过程由下载列表和操作参数控制，直到下载完毕。
    ParseSegment parseSegment = new ParseSegment(getConf());	//解析数据段；数据段（Segment）存放网络爬虫每一次抓取使用的待下载列表、已经获得的网页内容和本次内容的索引。
    CrawlDb crawlDbTool = new CrawlDb(getConf());				//抓取数据库工具
    LinkDb linkDbTool = new LinkDb(getConf());					//链接库工具
      
    // initialize crawlDb
    injector.inject(crawlDb, rootUrlDir);
    int i;
    for (i = 0; i < depth; i++) {             // generate new segment
      Path[] segs = generator.generate(crawlDb, segments, -1, topN, System
          .currentTimeMillis());
      if (segs == null) {
        LOG.info("Stopping at depth=" + i + " - no more URLs to fetch.");
        break;
      }
      fetcher.fetch(segs[0], threads);  // fetch it
      if (!Fetcher.isParsing(job)) {
        parseSegment.parse(segs[0]);    // parse it, if needed
      }
      crawlDbTool.update(crawlDb, segs, true, true); // update crawldb
    }
    if (i > 0) {
      linkDbTool.invert(linkDb, segments, true, true, false); // invert links

      /**
       * 使用solr索引处理完毕的文件，索引、复制、合并
       */
      if (solrUrl != null) {
        // index, dedup & merge
        FileStatus[] fstats = fs.listStatus(segments, HadoopFSUtil.getPassDirectoriesFilter(fs));
        SolrIndexer indexer = new SolrIndexer(getConf());
        indexer.indexSolr(solrUrl, crawlDb, linkDb, 
          Arrays.asList(HadoopFSUtil.getPaths(fstats)));
        SolrDeleteDuplicates dedup = new SolrDeleteDuplicates();
        dedup.setConf(getConf());
        dedup.dedup(solrUrl);
      }
      
    } else {
      LOG.warn("No URLs to fetch - check your seed list and URL filters.");
    }
    if (LOG.isInfoEnabled()) { LOG.info("crawl finished: " + dir); }
    return 0;
  }

}

分享到：

nutch1.4 URLNormalizers 详解 | MyEclipse配置IvyDE

2012-03-29 11:16
浏览 1213
评论(0)
分类:开源软件
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

nutch1.4 crawl详解

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

nutch1.4 crawl详解

评论

发表评论

相关推荐

Nutch1.7二次开发培训讲义

nutch-default.xml 配置范例

nutch本地模式调试环境配置

nutch分布式调试环境配置

nutch 正文提取流程解析

用Eclipse开发nutch准备工作

nutch1.4 CrawlDatum详解

nutch1.4 分布式爬取

nutch1.4：爬虫定时抓取设置

nutch1.4 开发：增加外部jar包

nutch1.4 爬虫父页面参数传递到子页面注意事项

nutch1.4 Fetcher详解

nutch1.4 Protocol接口解析

nutch1.4自定义字段开发实例

nutch1.4插件开发

nutch1.4 解析器 ParseSegment详解

nutch1.4 Generator详解

nutch1.4 ScoringFilter详解

nutch1.4 URLFilter详解

nutch1.4 URLNormalizers 详解

最近访客更多访客>>