解决方法
下面是我编写的函数：它使用 Jsoup 递归遍历各个节点，输出其中的文本（包括换行符）。
- public static String htmlToText(InputStream html) throws IOException {
- Document document = Jsoup.parse(html,null,"");
- Element body = document.body();
- return buildStringFromNode(body).toString();
- }
- private static StringBuffer buildStringFromNode(Node node) {
- StringBuffer buffer = new StringBuffer();
- if (node instanceof TextNode) {
- TextNode textNode = (TextNode) node;
- buffer.append(textNode.text().trim());
- }
- for (Node childNode : node.childNodes()) {
- buffer.append(buildStringFromNode(childNode));
- }
- if (node instanceof Element) {
- Element element = (Element) node;
- String tagName = element.tagName();
- if ("p".equals(tagName) || "br".equals(tagName)) {
- buffer.append("\n");
- }
- }
- return buffer;
- }