*/
public class XMLReader {
private static final String TITLE_TAG = "title"; //$NON-NLS-1$
+
private static final String TIMESTAMP_TAG = "timestamp"; //$NON-NLS-1$
+
private static final String TEXT_TAG = "text"; //$NON-NLS-1$
+
private static final String PAGE_TAG = "page"; //$NON-NLS-1$
public XMLReader() {
private static void traverse(String eleName, Node cNode, Parsed parsed) {
switch (cNode.getNodeType()) {
case Node.DOCUMENT_NODE:
-// System.out.println("DOCUMENT_NODE " + cNode.getNodeName());
+ // System.out.println("DOCUMENT_NODE " + cNode.getNodeName());
processChildren(eleName, cNode.getChildNodes(), parsed);
break;
case Node.ELEMENT_NODE:
eleName = cNode.getNodeName();
-// System.out.println("ELEMENT_NODE " + eleName);
-// NamedNodeMap attributeMap = cNode.getAttributes();
-// int numAttrs = attributeMap.getLength();
-// for (int i = 0; i < attributeMap.getLength(); i++) {
-// Attr attribute = (Attr) attributeMap.item(i);
-// String attrName = attribute.getNodeName();
-// String attrValue = attribute.getNodeValue();
-// }
+ // System.out.println("ELEMENT_NODE " + eleName);
+ // NamedNodeMap attributeMap = cNode.getAttributes();
+ // int numAttrs = attributeMap.getLength();
+ // for (int i = 0; i < attributeMap.getLength(); i++) {
+ // Attr attribute = (Attr) attributeMap.item(i);
+ // String attrName = attribute.getNodeName();
+ // String attrValue = attribute.getNodeValue();
+ // }
processChildren(eleName, cNode.getChildNodes(), parsed);
break;
case Node.CDATA_SECTION_NODE:
}
/**
+ * Reads the text of the first timestamp element found in the given
+ * Wikipedia XML stream.
+ *
+ * The complete stream is parsed into a DOM tree first; the elements named
+ * by TIMESTAMP_TAG are then looked up and only the first one is used.
+ *
+ * @param stream
+ *            the XML input to parse
+ * @return the value of the first child node of the first timestamp
+ *         element, or null if the document contains no timestamp
+ *         element or that element has no child node
+ * @throws Exception
+ *             if the parser cannot be created or the stream cannot be
+ *             read or parsed
+ */
+ public static String getTimestamp(InputStream stream) throws Exception {
+     // Create a DOM parser (DocumentBuilder) and build the Document tree
+     DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+     DocumentBuilder parser = factory.newDocumentBuilder();
+     Document document = parser.parse(stream);
+     // Ask the document for all <timestamp> elements, in document order
+     NodeList timestamps = document.getElementsByTagName(TIMESTAMP_TAG);
+     if (timestamps.getLength() == 0) {
+         return null;
+     }
+     // Only the generic DOM Node interface is needed here; casting to a
+     // parser-specific element class would fail with other parsers.
+     Node content = timestamps.item(0).getFirstChild();
+     // An empty element (<timestamp/>) has no child node to read from
+     return content == null ? null : content.getNodeValue();
+ }
+
+ /**
+ * Reads the first timestamp of the given Wikipedia XML stream and reduces
+ * it to its digits in the order yyyyMMddHHmmss.
+ *
+ * The timestamp is expected in the form 2004-11-22T12:41:10Z, which yields
+ * 20041122124110. The fields are taken by position only; the separator
+ * characters between them are not validated.
+ *
+ * @param stream
+ *            the XML input handed on to getTimestamp(InputStream)
+ * @return the 14 characters taken from the year, month, day, hour, minute
+ *         and second positions, or null if no timestamp was found or it
+ *         is too short to contain all six fields
+ * @throws Exception
+ *             if getTimestamp(InputStream) fails to read or parse the
+ *             stream
+ */
+ public static String getDateTimestamp(InputStream stream) throws Exception {
+     String timestamp = getTimestamp(stream);
+     if (timestamp == null) {
+         return null;
+     }
+     // Surrounding whitespace in the element text would shift every offset
+     timestamp = timestamp.trim();
+     // "yyyy-MM-ddTHH:mm:ss" needs 19 characters; anything shorter cannot
+     // be sliced at the fixed offsets below
+     if (timestamp.length() < 19) {
+         return null;
+     }
+     // 2004-11-22T12:41:10Z
+     return timestamp.substring(0, 4) // year
+             + timestamp.substring(5, 7) // month
+             + timestamp.substring(8, 10) // day
+             + timestamp.substring(11, 13) // hour
+             + timestamp.substring(14, 16) // minute
+             + timestamp.substring(17, 19); // second
+ }
+
+ /**
* Reads the wikipedia xml data from the given stream
*
* @param stream
- * @return
+ * @return
* @throws CoreException
*/
public static ArrayList readFromStream(Reader stream) throws CoreException {
Document document = parser.parse(new InputSource(stream));
// Ask the document for a list of all <page> tags it contains
NodeList pages = document.getElementsByTagName(PAGE_TAG);
- // Loop through those <mediawiki> elements one at a time, and extract the
- // content of their <page> tags.
+ // Loop through those <page> elements one at a time
int numPages = pages.getLength();
for (int i = 0; i < numPages; i++) {
} catch (IOException e) {
throwReadException(e);
} catch (SAXParseException e) {
-// System.out.println("SAXParseException in line:" + e.getLineNumber() + " column:" + e.getColumnNumber());
+ // System.out.println("SAXParseException in line:" + e.getLineNumber() + " column:" + e.getColumnNumber());
throwReadException(e);
} catch (SAXException e) {
throwReadException(e);
Node node = attributes.getNamedItem(name);
return node == null ? null : node.getNodeValue();
}
-
+
// public static void saveToFile(File file) throws CoreException {
// OutputStream stream = null;
// try {
+ "\r\n" + "[[Kategorie:Rhetorischer Begriff]]\r\n"
+ "[[en:Synaesthesia]] [[es:Sinestesia]] [[sv:Synestesi]] [[tr:Sinestezi]]</text>\r\n" + " </revision>\r\n"
+ " </page>\r\n" + "</mediawiki>";
- StringReader st = new StringReader(test2);
+ StringReader st = new StringReader(test);
- readFromStream(st);
+ System.out.println(readFromStream(st));
} catch (CoreException e) {
// TODO Auto-generated catch block
e.printStackTrace();