java语言

如何使用Java实现AC自动机全文检索实例

时间:2024-10-24 00:35:43 java语言 我要投稿
  • 相关推荐

如何使用Java实现AC自动机全文检索实例

  导语:如何使用Java实现AC自动机全文检索,下面是小编给大家推荐的代码实现过程,大家可以参考阅读,更多详情请关注应届毕业生考试网。

  第一步,构建Trie树,定义Node类型:

  /**

  * Created by zhaoyy on 2017/2/7.

  */

  interface Node {

  char value();

  boolean exists();

  boolean isRoot();

  Node parent();

  Node childOf(char c);

  Node fail();

  void setFail(Node node);

  void setExists(boolean exists);

  void add(Node child);

  List<Node> children();

  }

  第二步,实现两种Node,如果词汇全是可打印的ASCII字符,就采用AsciiNode,否则(比如包含汉字),使用基于hash表的MapNode;这两种Node均集成自AbstractNode:

  /**

  * Created by zhaoyy on 2017/2/8.

  */

  abstract class AbstractNode implements Node {

  private static final char EMPTY = '\0';

  private final char value;

  private final Node parent;

  private boolean exists;

  private Node fail;

  public AbstractNode(Node parent, char value) {

  this.parent = parent;

  this.value = value;

  this.exists = false;

  this.fail = null;

  }

  public AbstractNode() {

  this(null, EMPTY);

  }

  private static String tab(int n) {

  StringBuilder builder = new StringBuilder();

  for (int i = 0; i < n; i++) {

  builder.append('\t');

  }

  return builder.toString();

  }

  private static String toString(Node node, int depth) {

  StringBuilder builder = new StringBuilder();

  String tab = tab(depth);

  Node fail = node.fail();

  Node parent = node.parent();

  builder

  .append(tab)

  .append('<')

  .append(node.value())

  .append(" exists=\"")

  .append(node.exists())

  .append('"')

  .append(" parent=\"")

  .append(parent == null ? "null" : parent.value())

  .append('"')

  .append(" fail=\"")

  .append(fail == null ? "null" : fail.value())

  .append("\">\r\n");

  for (Node child : node.children())

  builder.append(toString(child, depth + 1));

  builder

  .append(tab)

  .append("</")

  .append(node.value())

  .append(">\r\n");

  return builder.toString();

  }

  @Override

  public char value() {

  return value;

  }

  @Override

  public boolean exists() {

  return exists;

  }

  @Override

  public boolean isRoot() {

  return value == EMPTY;

  }

  @Override

  public Node parent() {

  return parent;

  }

  @Override

  public Node fail() {

  return fail;

  }

  @Override

  public void setFail(Node node) {

  this.fail = node;

  }

  @Override

  public void setExists(boolean exists) {

  this.exists = exists;

  }

  @Override

  public String toString() {

  return toString(this, 0);

  }

  }

  /////////////////////////////////////////////////////////////////////////////////////////////

  /**

  * Created by zhaoyy on 2017/2/8.

  */

  final class AsciiNode extends AbstractNode implements Node {

  private static final char FROM = 32;

  private static final char TO = 126;

  private final Node[] children;

  public AsciiNode() {

  super();

  this.children = new Node[TO - FROM + 1];

  }

  public AsciiNode(Node parent, char value) {

  super(parent, value);

  this.children = new Node[TO - FROM + 1];

  }

  @Override

  public Node childOf(char c) {

  if (c >= FROM && c <= TO)

  return children[(int) c - FROM];

  else return null;

  }

  @Override

  public void add(Node child) {

  int index = (int) child.value();

  children[index - FROM] = child;

  }

  @Override

  public List<Node> children() {

  List<Node> nodes = new ArrayList<Node>();

  for (Node child : children)

  if (child != null)

  nodes.add(child);

  return nodes;

  }

  }

  //////////////////////////////////////////////////////////////////////////////////////////////

  /**

  * Created by zhaoyy on 2017/2/8.

  */

  final class MapNode extends AbstractNode implements Node {

  private final Map<Character, Node> children;

  public MapNode() {

  super();

  this.children = new HashMap<Character, Node>();

  }

  public MapNode(Node parent, char value) {

  super(parent, value);

  this.children = new HashMap<Character, Node>();

  }

  @Override

  public Node childOf(char c) {

  return children.get(c);

  }

  @Override

  public void add(Node child) {

  children.put(child.value(), child);

  }

  @Override

  public List<Node> children() {

  List<Node> nodes = new ArrayList<Node>();

  nodes.addAll(children.values());

  return nodes;

  }

  }

  第三步,

  首先定义一个Node构造器:

  /**

  * Created by zhaoyy on 2017/2/8.

  */

  public interface NodeMaker {

  Node make(Node parent, char value);

  Node makeRoot();

  }

  然后构建AC自动机,实现创建及查找方法:

  /**

  * Created by zhaoyy on 2017/2/7.

  */

  public final class WordTable {

  private final Node root;

  private WordTable(Collection<? extends CharSequence> words, NodeMaker maker) {

  Node root = buildTrie(words, maker);

  setFailNode(root);

  this.root = root;

  }

  public static WordTable compile(Collection<? extends CharSequence> words) {

  if (words == null || words.isEmpty())

  throw new IllegalArgumentException();

  final NodeMaker maker;

  if (isAscii(words))

  maker = new NodeMaker() {

  @Override

  public Node make(Node parent, char value) {

  return new AsciiNode(parent, value);

  }

  @Override

  public Node makeRoot() {

  return new AsciiNode();

  }

  };

  else maker = new NodeMaker() {

  @Override

  public Node make(Node parent, char value) {

  return new MapNode(parent, value);

  }

  @Override

  public Node makeRoot() {

  return new MapNode();

  }

  };

  return new WordTable(words, maker);

  }

  private static boolean isAscii(Collection<? extends CharSequence> words) {

  for (CharSequence word : words) {

  int len = word.length();

  for (int i = 0; i < len; i++) {

  int c = (int) word.charAt(i);

  if (c < 32 || c > 126)

  return false;

  }

  }

  return true;

  }

  private static Node buildTrie(Collection<? extends CharSequence> sequences, NodeMaker maker) {

  Node root = maker.makeRoot();

  for (CharSequence sequence : sequences) {

  int len = sequence.length();

  Node current = root;

  for (int i = 0; i < len; i++) {

  char c = sequence.charAt(i);

  Node node = current.childOf(c);

  if (node == null) {

  node = maker.make(current, c);

  current.add(node);

  }

  current = node;

  if (i == len - 1)

  node.setExists(true);

  }

  }

  return root;

  }

  private static void setFailNode(final Node root) {

  root.setFail(null);

  Queue<Node> queue = new LinkedList<Node>();

  queue.add(root);

  while (!queue.isEmpty()) {

  Node parent = queue.poll();

  Node temp;

  for (Node child : parent.children()) {

  if (parent.isRoot())

  child.setFail(root);

  else {

  temp = parent.fail();

  while (temp != null) {

  Node node = temp.childOf(child.value());

  if (node != null) {

  child.setFail(node);

  break;

  }

  temp = temp.fail();

  }

  if (temp == null)

  child.setFail(root);

  }

  queue.add(child);

  }

  }

  }

  public boolean findAnyIn(CharSequence cs) {

  int len = cs.length();

  Node node = root;

  for (int i = 0; i < len; i++) {

  Node next = node.childOf(cs.charAt(i));

  if (next == null) {

  next = node.fail();

  if (next == null) {

  node = root;

  continue;

  }

  }

  if (next.exists())

  return true;

  }

  return false;

  }

  public List<MatchInfo> search(CharSequence cs) {

  if (cs == null || cs.length() == 0)

  return Collections.emptyList();

  List<MatchInfo> result = new ArrayList<MatchInfo>();

  int len = cs.length();

  Node node = root;

  for (int i = 0; i < len; i++) {

  Node next = node.childOf(cs.charAt(i));

  if (next == null) {

  next = node.fail();

  if (next == null) {

  node = root;

  continue;

  }

  }

  if (next.exists()) {

  MatchInfo info = new MatchInfo(i, next);

  result.add(info);

  node = root;

  continue;

  }

  node = next;

  }

  return result;

  }

  @Override

  public String toString() {

  return root.toString();

  }

  }

  定义一个保存查找结果的实体:

  /**

  * Created by zhaoyy on 2017/2/7.

  */

  public final class MatchInfo {

  private final int index;

  private final String word;

  public MatchInfo(int index, String word) {

  this.index = index;

  this.word = word;

  }

  public MatchInfo(int index, Node node) {

  StringBuilder builder = new StringBuilder();

  while (node != null) {

  if (!node.isRoot())

  builder.append(node.value());

  node = node.parent();

  }

  String word = builder.reverse().toString();

  this.index = index + 1 - word.length();

  this.word = word;

  }

  public int getIndex() {

  return index;

  }

  public String getWord() {

  return word;

  }

  @Override

  public String toString() {

  return index + ":" + word;

  }

  }

  第四步,调用Demo:

  public static void main(String[] args) {

  List<String> list = Arrays.asList("say", "her", "he", "she", "shr", "alone");

  WordTable table = WordTable.compile(list);

  System.out.println(table);

  System.out.println(table.search("1shesaynothingabouthislivinghimalone"));

  }

  以下是输出结果:

  < exists="false" parent="null" fail="null">

  <s exists="false" parent=" " fail=" ">

  <a exists="false" parent="s" fail="a">

  <y exists="true" parent="a" fail=" ">

  </y>

  </a>

  <h exists="false" parent="s" fail="h">

  <e exists="true" parent="h" fail="e">

  </e>

  <r exists="true" parent="h" fail=" ">

  </r>

  </h>

  </s>

  <h exists="false" parent=" " fail=" ">

  <e exists="true" parent="h" fail=" ">

  <r exists="true" parent="e" fail=" ">

  </r>

  </e>

  </h>

  <a exists="false" parent=" " fail=" ">

  &lt;l exists="false" parent="a" fail=" ">

  <o exists="false" parent="l" fail=" ">

  <n exists="false" parent="o" fail=" ">

  <e exists="true" parent="n" fail=" ">

  </e>

  </n>

  </o>

  &lt;/l>

  </a>

  </ >

  [1:she, 4:say, 31:alone]

【如何使用Java实现AC自动机全文检索实例】相关文章:

java通用组合算法如何实现09-12

Java实现在不同线程中运行的代码实例详解06-11

如何使用PS实现皮肤美白07-07

PHP如何使用curl实现数据抓取09-27

Java入门教程:如何使用一个Java06-12

java调用c函数的实例09-16

如何使用javascript实现瀑布流及效果加载06-17

Java队列类编写实例09-05

java读取解析xml文件实例08-05

Java中的多态用法实例分析10-23