///////////////////////////////////////////////////////// // Module SearchForText.java // Version 1.03 // Language Java // Author Sudhakar Chandrasekaran (thaths@netscape.com) // History // 05/17/97 House cleaning // 05/16/97 Implemented the crawler from JavaWorld Magazine // 04/20/97 Complex searches // // Legalese // Copyright (C) 1996-1997 Sudhakar Chandrasekharan // // This program is free software; you can redistribute it and/or modify it // under the terms of the GNU General Public License as published by the // Free Software Foundation; either version 2 of the license, or (at your // option) any later version. // // This program is distributed in the hope that it will be useful, but // WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // General Public License for more details. Writers of free software // cannot afford getting sued! // // You should have received a copy of the GNU General Public License along // with this program; if not, write to the Free Software Foundation, Inc., // 675 Mass Ave., Cambridge, MA 02139, USA. // // Sudhakar "Thaths" Chandrasekharan, 20 April 1997. //////////////////////////////////////////////////////// import java.awt.*; import java.net.*; import java.io.*; import java.util.StringTokenizer; import java.util.Enumeration; import java.util.Vector; import java.util.Hashtable; public class SearchForText extends Thread { // The maximum number of simultaneous threads public final static int MAX_SIMULTANEOUS_THREADS = 4; // TrafficCop is a class that makes sure that not more than // MAX_SIMULTANEOUS_THREADS are running simultaneously static TrafficCop trafficCop = new TrafficCop(MAX_SIMULTANEOUS_THREADS); URL pageToSearch; MyFrame myFrame; String searchString; Vector subStrings = new Vector(5); // Constructor SearchForText(MyFrame mf, String page, String string) { myFrame = mf; try { pageToSearch = new URL(page); setName(page); start(); } catch (MalformedURLException e) { System.err.println("ClientSearch: Not Searching " + page + " - Bad URL."); this.stop(); } searchString = new String(string); // If boolean search, split up the search string // into substrings if (myFrame.optionsPanel.isBooleanSearch()) { StringTokenizer st = new StringTokenizer(searchString); subStrings.ensureCapacity(st.countTokens()); while (st.hasMoreTokens()) { subStrings.addElement((String) st.nextToken()); } } else { subStrings.addElement(searchString); } } public void run() { myFrame.setCursor(Frame.WAIT_CURSOR); myFrame.searchPanel.searchButton.disable(); myFrame.currentPageNumber++; if (myFrame.currentPageNumber > myFrame.clientSearch.totalPages) { System.out.println("ClientSearch: More than" + myFrame.clientSearch.totalPages + "pages have been searched. So stopping."); this.stop(); } int token; // This thread will run only if it gets a token from the trafficCop String contentsOfPage; // The entire page is stored in this string Vector linksInPage; // The parsed links in a page are stored here boolean isMatch; // Before searching a page, get a token from the trafficCop // getToken() will block if more than MAX_SIMULTANEOUS_THREADS // are already running. token = trafficCop.getToken(); // Grab the contents of a page contentsOfPage = getPage(pageToSearch); // Grab the links in the page to crawl linksInPage = extractLinks(contentsOfPage); // Have three seperate methods for searching // Only execute the required method if (myFrame.optionsPanel.isORSearch()) { isMatch = matchOR(contentsOfPage, subStrings); } else if (myFrame.optionsPanel.isANDSearch()) { isMatch = matchAND(contentsOfPage, subStrings); } else { isMatch = matchAsAString(contentsOfPage, searchString); } // If there was a match in the page // add the result to the list of results in resultPanel if (isMatch) { String title; if ((title = extractTitle(contentsOfPage)) == null) { // No valid title was specified in the page // Use the page's URL instead myFrame.resultPanel.addResult(pageToSearch.toString(),pageToSearch.toString()); } else { myFrame.resultPanel.addResult(title,pageToSearch.toString()); } } // For all the links in the page // check to see if they have already been visited // if not visited, spawn a speerate thread to search it Enumeration enumLinks = linksInPage.elements(); while(enumLinks.hasMoreElements()) { String nextPage = (String) enumLinks.nextElement(); if (! alreadyVisited(nextPage)) { haveVisited(nextPage); new SearchForText(myFrame, nextPage, searchString); } else { // System.out.println("Have already visited " + nextPage.toString()); } } // Finished searching. So return token for the other strings trafficCop.returnToken(token); try { Thread.sleep( (int) (Math.random()*200)); } catch (Exception e) { System.err.println("ClientSearch: Unknown error while sleeping:" + e.getMessage()); } myFrame.searchPanel.searchButton.enable(); myFrame.setCursor(Frame.DEFAULT_CURSOR); } protected String getPage (URL dukeOfEarl) { // Don't use any of the foll boolean values // Will use later for HTML parsing boolean beginTag = false; boolean endTag = false; boolean insideScript = false; // Don't use this right now. Will eventually. InputStream conn = null; DataInputStream data = null; String line; StringBuffer buff = new StringBuffer(); try { conn = dukeOfEarl.openStream(); data = new DataInputStream(conn); while ((line=data.readLine()) != null) { // System.out.println(line); buff.append(line); } } catch (IOException e) { System.out.println("ClientSearch: IOError while searching: " + e.getMessage()); // e.printStackTrace(); return ""; } return buff.toString(); } // Method to see if a page hs already been visited protected boolean alreadyVisited(String page) { return myFrame.clientSearch.pageDB.containsKey(page); } // Method to mark a page as visited protected void haveVisited(String page) { myFrame.clientSearch.pageDB.put(page, page); } // Method to do the AND search // Slowest of all the match methods. For a sucess, // all the substrings have to be searched for protected boolean matchAND(String content, Vector subStrs) { Enumeration enumSubstrings = subStrs.elements(); while(enumSubstrings.hasMoreElements()) { String subString = (String) enumSubstrings.nextElement(); // for each substring if (content.indexOf(subString) == -1) { // no match. So return false return false; } } // If we fall out of the loop naturally, that means // that all the substrings were found in the page return true; } // Method to do the OR search // Medium speed. The speed can very between // matchAND(slowest) and matchAsAString(fastest) protected boolean matchOR(String content, Vector subStrs) { Enumeration enumSubstrings = subStrs.elements(); while(enumSubstrings.hasMoreElements()) { String subString = (String) enumSubstrings.nextElement(); // For each of the substrings if (content.indexOf(subString) != -1) { // There is a match. Wohoo! return true; } } // We have fallen out of the loop // No matches were found return false; } // The fastest search method // Note that there is no loop protected boolean matchAsAString(String content, String subString) { if (content.indexOf(subString) != -1) { return true; } else { return false; } } public String extractTitle (String content) { int beginTitleTag, endTitleTag; String buff = new String (content.toUpperCase()); // find the position of the opening TITLE tag // and the closing TITLE tag beginTitleTag = content.indexOf("