package csm23; import java.io.*; import java.util.*; import java.text.*; import java.lang.*; import org.apache.xalan.xpath.xml.*; // Import packages dealing with XSL transformation import org.apache.xalan.xslt.*; import org.apache.xalan.xpath.*; import org.xml.sax.*; import org.w3c.dom.*; import org.apache.xerces.dom.*; import org.apache.xerces.parsers.*; import org.apache.xml.serialize.*; import javax.xml.parsers.*; import org.xml.sax.SAXException; import org.xml.sax.SAXParseException; /** * Title: Process XML type Reuters news * Copyright: Copyright (c) 2002 * Company: * @author Tugba Taskaya * @version 1.0 */ public class GetNews { private Document doc = null; private XPathHelper xPathHelper = new XPathHelper(); public ArrayList news_vector; /** * Sets the filename to be processed * @param fileName the file name */ public GetNews( String fileName) { news_vector = new ArrayList(); //getDocument( fileName ); } public GetNews( ) { news_vector = new ArrayList(); } /** * This method returns the documents which satify the given query. * Query values are sent via this method. For example, if the user chose US and chemical industry type * news, these query items should be put to "industry_query" and "country_query" arraylists. 
* @param path1 the directory of files * @param industry_query the query values for industry category * @param country_query the query values for country category * @param topic_query query values for topic category * @return an arraylist comprising the XMLDocument objects */ public ArrayList getWholeDocuments(ArrayList path1, ArrayList industry_query, ArrayList country_query, ArrayList topic_query ) { ListIterator l = path1.listIterator(); for(long i=0;i< path1.size();i++){ //System.out.println(i+" "+path[i]+" "+Runtime.getRuntime().totalMemory()+" "+Runtime.getRuntime().freeMemory()); getDocument( (String)(l.next()), industry_query, country_query, topic_query); //System.out.println("M:"+(Runtime.getRuntime().totalMemory()-Runtime.getRuntime().freeMemory())); System.out.flush(); } return news_vector; } /** * Extracts the content of the given file * @param fileName the file to be processed * @return the content of the file (main body of the file) */ public String [] getContentOfDocument (String fileName) { String content[] ; InputSource is = null; try { is = new InputSource(new FileInputStream(fileName)); } catch (IOException ioe) { } Document doc = null; try { DOMParser parser = new DOMParser(); parser.parse(is); doc = parser.getDocument(); } catch (IOException ioe) {System.out.println("IO Exception"); } catch (SAXException saxe) { System.out.println("SAX Exception"); } try { NodeList nl_content = xPathHelper.processXPath("//text/p", doc.getDocumentElement() ); content = new String[nl_content.getLength()]; for( int i=0; i < nl_content.getLength(); i++ ) { Element e = (Element)nl_content.item(i); content[i] = new String(DOMUtils.getMergedTextChildren( e )); //System.out.println( content[i] ); } return content; } catch( Exception ex ) { ex.printStackTrace(); } return null; } /** * Extracts all the information from the given file based on the query entries * @param fileName the name of file * @param industry_query the query values for industry category * @param 
country_query the query values for country category * @param topic_query query values for topic category * */ private void getDocument( String fileName, ArrayList industry_query, ArrayList country_query, ArrayList topic_query ) { InputSource is = null; try { is = new InputSource(new FileInputStream(fileName)); } catch (IOException ioe) { } Document doc = null; try { DOMParser parser = new DOMParser(); parser.parse(is); doc = parser.getDocument(); } catch (IOException ioe) {System.out.println("IO Exception"); } catch (SAXException saxe) { System.out.println("SAX Exception"); } catch (Exception xe) { System.out.println("Exception"); } try { XMLDocument tmp_xmldoc = new XMLDocument(3); NodeList nl_code = xPathHelper.processXPath("//codes[@class='bip:countries:1.0']/code", doc.getDocumentElement() ); tmp_xmldoc.setLength(0, nl_code.getLength()); //System.out.println(nl_code.getLength()); NodeList nl_code1 = xPathHelper.processXPath("//codes[@class='bip:industries:1.0']/code", doc.getDocumentElement() ); tmp_xmldoc.setLength(1, nl_code1.getLength()); //System.out.println(nl_code1.getLength()); NodeList nl_code2 = xPathHelper.processXPath("//codes[@class='bip:topics:1.0']/code", doc.getDocumentElement() ); tmp_xmldoc.setLength(2, nl_code2.getLength()); //System.out.println(nl_code2.getLength()); if(nl_code.getLength()>=nl_code1.getLength() && nl_code.getLength()>=nl_code2.getLength()) tmp_xmldoc.setAttributeDimension(nl_code.getLength()); if(nl_code1.getLength()>=nl_code.getLength() && nl_code1.getLength()>=nl_code2.getLength()) tmp_xmldoc.setAttributeDimension(nl_code1.getLength()); if(nl_code2.getLength()>=nl_code.getLength() && nl_code2.getLength()>=nl_code1.getLength()) tmp_xmldoc.setAttributeDimension(nl_code2.getLength()); //System.out.println(tmp_xmldoc.attributeCodes.length); boolean if_right_doc = true; if(country_query!=null && topic_query!=null && industry_query!=null && (nl_code.getLength(), or return null. 
// NOTE(review): the class header and the opening of the next Javadoc appear to be missing before this point - confirm against the original file.
    /**
     * Returns the first element with the given tag name found beneath e, or null.
     * The lookup uses Element.getElementsByTagName, so despite the method name the
     * match may be any descendant of e, not only a direct child.
     * @return org.w3c.dom.Element the first match in document order, or null if there is none
     * @param e org.w3c.dom.Element the element to search under
     * @param sTag java.lang.String the tag name to look for
     */
    public static Element getFirstChildTag(Element e, String sTag) {
        NodeList nl = e.getElementsByTagName(sTag);
        if (nl == null || nl.getLength() == 0) {
            return null;
        }
        return (Element) nl.item(0);
    }

    /**
     * Return the first child node of this node that is an Element, or
     * null if there aren't any.
     * @return org.w3c.dom.Element the first Element child, or null
     * @param node org.w3c.dom.Node the node whose direct children are examined
     */
    public static Element getFirstElementChild(Node node) {
        NodeList nl = node.getChildNodes();
        for (int i = 0; i < nl.getLength(); i++) {
            Node n = nl.item(i);
            // Test the child itself; the earlier code tested the NodeList, so the
            // result did not depend on what kind of node the child was.
            if (n instanceof Element) {
                return (Element) n;
            }
        }
        return null;
    }

    /**
     * Merge all Text child nodes of the given node, stripping external whitespace. Return an
     * empty string if there are no such nodes, or if there's only
     * whitespace. If it's mixed content, non-text nodes are eliminated.
     * Internal carriage returns are preserved. Entity references are expanded by
     * merging their own Text children recursively.
     * @return java.lang.String the merged, trimmed text
     * @param node org.w3c.dom.Node the node whose direct children are merged
     */
    public static String getMergedTextChildren(Node node) {
        NodeList nl = node.getChildNodes();
        // StringBuffer (not StringBuilder) keeps this compatible with the pre-Java-5 style of the file.
        StringBuffer merged = new StringBuffer();
        for (int i = 0; i < nl.getLength(); i++) {
            Node n = nl.item(i);
            if (n instanceof Text) {
                merged.append(((Text) n).getData());
            } else if (n instanceof EntityReference) {
                // Handle entity references recursively
                merged.append(getMergedTextChildren(n));
            }
        }
        return merged.toString().trim();
    }
}