java语言

Java如何用正则表达式分析读取网页内容

时间:2024-06-29 14:21:17 java语言 我要投稿
  • 相关推荐

Java如何用正则表达式分析读取网页内容

  导语:Java如何用正则表达式分析读取网页内容?下面是小编给大家提供的代码实现,大家可以参考阅读,更多详情请关注应届毕业生考试网。

  Java用正则表达式分析读取网页内容

  package com.xiaofeng.picup;

  import java.io.BufferedReader;

  import java.io.IOException;

  import java.io.InputStreamReader;

  import java.net.MalformedURLException;

  import java.net.URL;

  import java.util.ArrayList;

  import java.util.HashMap;

  import java.util.List;

  import java.util.regex.Matcher;

  import java.util.regex.Pattern;

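  /**
   * Scrapes article titles and content from Yahoo Knowledge (test code). The URL is entered
   * manually; this could be extended to crawl the entire site automatically.
   */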

  public class WebContent {

  /** *//**

  * 读取一个网页全部内容

  */

  public String getOneHtml(String htmlurl) throws IOException...{

  URL url;

  String temp;

  StringBuffer sb = new StringBuffer();

  try ...{

  url = new URL(htmlurl);

  BufferedReader in = new BufferedReader(new InputStreamReader(url

  .openStream(), "utf-8"));// 读取网页全部内容

  while ((temp = in.readLine()) != null) ...{

  sb.append(temp);

  }

  in.close();

  }catch(MalformedURLException me)...{

  System.out.println("你输入的URL格式有问题!请仔细输入");

  me.getMessage();

  throw me;

  }catch (IOException e) ...{

  e.printStackTrace();

  throw e;

  }

  return sb.toString();

  }

  /** *//**

  *

  * @param s

  * @return 获得网页标题

  */

  public String getTitle(String s) ...{

  String regex;

  String title = "";

  List<String> list = new ArrayList<String>();

  regex = "<title>.*?</title>";

  Pattern pa = Pattern.compile(regex, Pattern.CANON_EQ);

  Matcher ma = pa.matcher(s);

  while (ma.find()) ...{

  list.add(ma.group());

  }

  for (int i = 0; i < list.size(); i++) ...{

  title = title + list.get(i);

  }

  return outTag(title);

  }

  /** *//**

  *

  * @param s

  * @return 获得链接

  */

  public List<String> getLink(String s) ...{

  String regex;

  List<String> list = new ArrayList<String>();

  regex = "<a[^>]*href=("([^"]*)"|'([^']*)'|([^s>]*))[^>]*>(.*?)</a>";

  Pattern pa = Pattern.compile(regex, Pattern.DOTALL);

  Matcher ma = pa.matcher(s);

  while (ma.find()) ...{

  list.add(ma.group());

  }

  return list;

  }

  /** *//**

  *

  * @param s

  * @return 获得脚本代码

  */

  public List<String> getScript(String s) ...{

  String regex;

  List<String> list = new ArrayList<String>();

  regex = "<script.*?</script>";

  Pattern pa = Pattern.compile(regex, Pattern.DOTALL);

  Matcher ma = pa.matcher(s);

  while (ma.find()) ...{

  list.add(ma.group());

  }

  return list;

  }

  /** *//**

  *

  * @param s

  * @return 获得CSS

  */

  public List<String> getCSS(String s) ...{

  String regex;

  List<String> list = new ArrayList<String>();

  regex = "<style.*?</style>";

  Pattern pa = Pattern.compile(regex, Pattern.DOTALL);

  Matcher ma = pa.matcher(s);

  while (ma.find()) ...{

  list.add(ma.group());

  }

  return list;

  }

  /** *//**

  *

  * @param s

  * @return 去掉标记

  */

  public String outTag(String s) ...{

  return s.replaceAll("<.*?>", "");

  }