I'm facing a style issue while converting .doc files to HTML text using Apache POI's HWPFDocument class. Another problem is that it also converts the style tags into text, like this:
.b1{white-space-collapsing:preserve;} .b2{margin: 1.1798611in 1.1798611in 1.1798611in 1.1798611in;} .s1{font-weight:bold;color:black;} .s2{color:black;} .s3{font-style:italic;color:black;} .p1{text-align:center;hyphenate:none;font-family:Times New Roman;font-size:12pt;} .p2{text-align:justify;hyphenate:none;font-family:Times New Roman;font-size:12pt;} .p3{text-align:end;hyphenate:none;font-family:Times New Roman;font-size:12pt;}
Main Title
Here's my code:
// Load the uploaded .doc and convert it into an in-memory HTML DOM.
HWPFDocumentCore wordDocument = WordToHtmlUtils.loadDoc(multipartFile.getInputStream());
WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
        DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
wordToHtmlConverter.processDocument(wordDocument);
org.w3c.dom.Document htmlDocument = wordToHtmlConverter.getDocument();

// Serialize the DOM to bytes as UTF-8 encoded, indented HTML.
ByteArrayOutputStream out = new ByteArrayOutputStream();
DOMSource domSource = new DOMSource(htmlDocument);
StreamResult streamResult = new StreamResult(out);
TransformerFactory tf = TransformerFactory.newInstance();
Transformer serializer = tf.newTransformer();
serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
serializer.setOutputProperty(OutputKeys.INDENT, "yes");
serializer.setOutputProperty(OutputKeys.METHOD, "html");
serializer.transform(domSource, streamResult);
out.close();

// Decode with the same charset the serializer was told to write. The
// single-argument String(byte[]) constructor uses the platform default
// charset, which corrupts non-ASCII text wherever that default is not UTF-8.
String html = new String(out.toByteArray(), java.nio.charset.StandardCharsets.UTF_8);
All I need is to get the .doc file's content into properly formatted HTML text.
Related
Left side table is distorted while comparing two HTML table data using daisydiff.jar.
I need your support to fix this issue. Thanks in advance
I am using the code below:
// Set up a SAX serializer that writes the diff result as HTML into finalResult.
StringWriter finalResult = new StringWriter();
SAXTransformerFactory tf = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
TransformerHandler result = tf.newTransformerHandler();
result.getTransformer().setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
result.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
result.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
result.getTransformer().setOutputProperty(OutputKeys.ENCODING, "UTF-8");
result.setResult(new StreamResult(finalResult));
ContentHandler postProcess = result;

Locale locale = Locale.getDefault();
String prefix = "diff";
NekoHtmlParser cleaner = new NekoHtmlParser();

// The two HTML strings are fed to the parser through character readers.
// (The original "new String reader(...)" was not valid Java.)
InputSource oldSource = new InputSource(new StringReader(html1));
InputSource newSource = new InputSource(new StringReader(html2));

// Parse the old (left) document and wrap it in a comparator.
DomTreeBuilder oldHandler = new DomTreeBuilder();
cleaner.parse(oldSource, oldHandler);
TextNodeComparator leftComparator = new TextNodeComparator(oldHandler, locale);

// Parse the new (right) document and wrap it in a comparator.
DomTreeBuilder newHandler = new DomTreeBuilder();
cleaner.parse(newSource, newHandler);
TextNodeComparator rightComparator = new TextNodeComparator(newHandler, locale);

// Run the diff; the output is emitted as SAX events into postProcess.
HtmlSaxDiffOutput output = new HtmlSaxDiffOutput(postProcess, prefix);
HTMLDiffer differ = new HTMLDiffer(output);
differ.diff(leftComparator, rightComparator);
I am exporting HTML content (user-typed content: a paragraph from an HTML page) to MS Word (.doc). When I export the content to Word from a C# Click event, it works fine, but there is a blue border around the content. enter image description here
// Reset the response and mark it as a downloadable Word document.
Response.Clear();
Response.Buffer = true;
Response.ContentType = "application/vnd.ms-word";
Response.AddHeader("content-disposition", "attachment;filename=" + Candidatename + "_" + filename + ".doc");

// Build a one-cell table that holds the content; the same border style is
// applied to the table, the row and the cell.
const string borderStyle = "border: 0px";

TableCell cell1 = new TableCell { Text = Convert.ToString(str) };
cell1.Attributes.Add("style", borderStyle);

TableRow tr1 = new TableRow();
tr1.Attributes.Add("style", borderStyle);
tr1.Cells.Add(cell1);

tb = new Table();
tb.Attributes.Add("style", borderStyle);
tb.Rows.Add(tr1);

// Render the table's HTML into an in-memory writer.
StringWriter sw = new StringWriter();
HtmlTextWriter hw = new HtmlTextWriter(sw);
tb.RenderControl(hw);
The variable "str" gets its value from SQL Server (dynamic data that was typed by the user in a text area control).
It's a table, where contents are in.
If you click on Layout / View Gridlines, which is currently activated, you'll deactivate it and these lines will disappear.
When you print that document, these lines will not be printed
Hello guys I am generating a Payment Invoice order in PDF from my html content and sending it by e-mail with the following code:
// Generates PDF Payment Invoice
StringBuilder sb = new StringBuilder();
// Verbatim string literal: the prefix must be '@' so that the doubled quotes ("") are
// read as literal quote characters.
sb.Append(@"<meta http-equiv=""Content-Type"" content=""text/html; charset=utf-8"">");
sb.Append(boletoBancario.MontaHtml());
StringReader sr = new StringReader(sb.ToString());

Document pdfDoc = new Document(PageSize.A4, 10f, 10f, 10f, 0f);
HTMLWorker htmlparser = new HTMLWorker(pdfDoc);
byte[] bytes;
memoryStream = new MemoryStream();
// The writer is attached to pdfDoc here; the PDF bytes end up in memoryStream.
PdfWriter writer = PdfWriter.GetInstance(pdfDoc, memoryStream);
pdfDoc.Open();

iTextSharp.text.html.simpleparser.StyleSheet ST = new iTextSharp.text.html.simpleparser.StyleSheet();
ST.LoadTagStyle("body", "encoding", "Identity-H");
htmlparser.SetStyleSheet(ST);
htmlparser.Parse(sr);
pdfDoc.Close();

// Copy the finished PDF out of the stream, then hand callers a fresh stream over
// those bytes (a new MemoryStream starts at position 0).
bytes = memoryStream.ToArray();
memoryStream.Close();
memoryStream = new MemoryStream(bytes);
// Sends E-mail with PDF PAYMENT INVOICE Attached
MailAddress de = new MailAddress(enderecoOrigem, HttpUtility.HtmlDecode(nomeOrigem));
MailAddress para = new MailAddress(enderecoDestino, HttpUtility.HtmlDecode(nomeDestino));
MailMessage mensagem = new MailMessage(de, para);

// NOTE(review): this credential is built but never assigned to smtp.Credentials, and
// UseDefaultCredentials is set to true below — confirm which authentication is intended.
NetworkCredential credential = new NetworkCredential(usuarioConta, senhaConta);

SmtpClient smtp = new SmtpClient();
smtp.Host = servidorSMTP;
smtp.Port = Convert.ToInt32(porta);

// Attach the generated PDF from the in-memory stream.
Attachment att = new Attachment(memoryStream, "Boleto.pdf", MediaTypeNames.Application.Pdf);
mensagem.Attachments.Add(att);
mensagem.Subject = "Payment Invoice";
// No format arguments are needed, so the literal is assigned directly.
mensagem.Body = "Your payment invoice is available.";
mensagem.IsBodyHtml = true;

smtp.UseDefaultCredentials = true;
smtp.EnableSsl = false;
smtp.Send(mensagem);
The problem is that the PDF attached to the e-mail does not render the HTML correctly, so it is still unformatted. However, when I create a blank file, put the entire HTML in it and open it in Chrome, it is formatted well.
I need to get this PDF correctly attached to the e-mail.
Could somebody help me? Here you can see the rendering problem.
Finally I've found the solution to my problem!
It was necessary to use itextSharp.xmlWorker library and do some changes in the code-behind. The reason is HTMLWorker really does not resolve CSS, so I had to use XMLWorker instead and do like following:
// PDF generation
var htmlBuilder = new StringBuilder();
htmlBuilder.Append(boletoBancario.MontaHtml());

// Rewrite the <br> variants before handing the markup to the XML parser.
string preparedHtml = htmlBuilder.ToString()
    .Replace("<br />", "<b></b>")
    .Replace("<br>", "<br></br>");
var htmlReader = new StringReader(preparedHtml);

var pdfDoc = new Document(PageSize.A4, 30, 30, 30, 30);
var writer = PdfWriter.GetInstance(pdfDoc, memoryStream);

// Assemble the pipeline from the inside out: PDF writer <- HTML <- CSS resolver.
var htmlContext = new HtmlPipelineContext(null);
htmlContext.SetTagFactory(Tags.GetHtmlTagProcessorFactory());
ICSSResolver cssResolver = XMLWorkerHelper.GetInstance().GetDefaultCssResolver(false);
var pdfStage = new PdfWriterPipeline(pdfDoc, writer);
var htmlStage = new HtmlPipeline(htmlContext, pdfStage);
IPipeline pipeline = new CssResolverPipeline(cssResolver, htmlStage);
var xmlParser = new XMLParser(new XMLWorker(pipeline, true));

pdfDoc.Open();
xmlParser.Parse(htmlReader);
pdfDoc.Close();

// Return a fresh stream over the finished PDF bytes.
byte[] pdfBytes = memoryStream.ToArray();
memoryStream.Close();
return new MemoryStream(pdfBytes);
Thanks you all btw!
I would like to create dynamic PDF documents using HTML and dynamic images. My code works fine with standard HTML and full paths for the images, but when I try to embed the image inline in the document I get the error
Exception Details: System.IO.IOException: The document has no pages.
Is there a way to embed the images without an HTTP call per image? I don't want that because I think it will cause scalability issues and the images are sensitive.
Here is my code that gives the IOException:
/// <summary>
/// Renders a fixed XHTML document to PDF with XMLWorker and returns it as an
/// "application/pdf" file result.
/// </summary>
/// <returns>A <see cref="FileStreamResult"/> over the in-memory PDF stream.</returns>
public ActionResult MakePdf()
{
    // Verbatim string literal: the prefix must be '@' so that the doubled quotes and
    // the embedded line breaks are legal.
    // NOTE(review): the img src is empty as shown here; the surrounding question is
    // about an inline (base64) image, so the data URI appears to have been stripped.
    string html = @"<?xml version=""1.0"" encoding=""UTF-8""?>
<!DOCTYPE html
PUBLIC ""-//W3C//DTD XHTML 1.0 Strict//EN""
""http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"">
<html xmlns=""http://www.w3.org/1999/xhtml"" xml:lang=""en"" lang=""en"">
<head>
<title>Minimal XHTML 1.0 Document with W3C DTD</title>
</head>
<body><img src='' width='62' height='80' style='float: left; margin-right: 28px;' /></body></html>";
    var bytes = Encoding.UTF8.GetBytes(html);
    using (MemoryStream input = new MemoryStream(bytes))
    {
        // Not wrapped in a using block: the stream is handed to the returned result.
        MemoryStream output = new MemoryStream();
        using (Document document = new Document(PageSize.LETTER, 50, 50, 50, 50))
        {
            using (PdfWriter writer = PdfWriter.GetInstance(document, output))
            {
                // Keep the output stream open after the writer is closed so it can be returned.
                writer.CloseStream = false;
                document.Open();
                XMLWorkerHelper xmlWorker = XMLWorkerHelper.GetInstance();
                xmlWorker.ParseXHtml(writer, document, input, null);
                document.Close();
                // Rewind so the result is read from the start of the PDF.
                output.Position = 0;
                return new FileStreamResult(output, "application/pdf");
            }
        }
    }
}
We need to write our own ImageTagProcessor to support processing of base 64 images:
/// <summary>
/// Image tag processor that additionally understands inline <c>data:image/...</c>
/// URIs in the <c>src</c> attribute. Any other <c>src</c> value is delegated to the
/// base image processor.
/// </summary>
public class CustomImageTagProcessor : iTextSharp.tool.xml.html.Image
{
    /// <summary>
    /// Builds the elements for an <c>img</c> tag.
    /// </summary>
    /// <param name="ctx">The worker context of the current parse.</param>
    /// <param name="tag">The tag being closed; its attributes supply <c>src</c>.</param>
    /// <param name="currentContent">Content collected inside the tag; passed through to the base processor.</param>
    /// <returns>
    /// An empty list when <c>src</c> is missing or empty; a single image chunk for a
    /// data URI; otherwise whatever the base processor returns.
    /// </returns>
    public override IList<IElement> End(IWorkerContext ctx, Tag tag, IList<IElement> currentContent)
    {
        IDictionary<string, string> attributes = tag.Attributes;
        string src;
        if (!attributes.TryGetValue(HTML.Attribute.SRC, out src) || string.IsNullOrEmpty(src))
        {
            return new List<IElement>(1);
        }

        // The URI scheme is a protocol token, not linguistic text, so compare ordinally.
        if (!src.StartsWith("data:image/", StringComparison.OrdinalIgnoreCase))
        {
            return base.End(ctx, tag, currentContent);
        }

        // data:[<MIME-type>][;charset=<encoding>][;base64],<data>
        // Everything after the first comma is the payload; the char overload of
        // IndexOf is an ordinal search.
        string base64Data = src.Substring(src.IndexOf(',') + 1);
        byte[] imageData = Convert.FromBase64String(base64Data);
        var image = iTextSharp.text.Image.GetInstance(imageData);

        var htmlPipelineContext = GetHtmlPipelineContext(ctx);
        // Apply the tag's CSS to the image, wrap it in a chunk, then apply the CSS to the chunk.
        var styledImage = (iTextSharp.text.Image)GetCssAppliers().Apply(image, tag, htmlPipelineContext);
        var list = new List<IElement>();
        list.Add(GetCssAppliers().Apply(new Chunk(styledImage, 0, 0, true), tag, htmlPipelineContext));
        return list;
    }
}
Then we can inject this new processor into the HtmlPipelineContext:
using (var doc = new Document(PageSize.A4))
{
    var writer = PdfWriter.GetInstance(doc, new FileStream("test.pdf", FileMode.Create));
    doc.Open();

    // Verbatim string literal: the prefix must be '@'.
    // NOTE(review): the img src is empty as shown here; the surrounding text is about
    // base64 images, so the data URI appears to have been stripped.
    var html = @"<img src='' width='62' height='80' style='float: left; margin-right: 28px;' />";

    var tagProcessors = (DefaultTagProcessorFactory)Tags.GetHtmlTagProcessorFactory();
    tagProcessors.RemoveProcessor(HTML.Tag.IMG); // remove the default processor
    tagProcessors.AddProcessor(HTML.Tag.IMG, new CustomImageTagProcessor()); // use our new processor

    // Start from XMLWorker's default CSS and add one extra rule.
    CssFilesImpl cssFiles = new CssFilesImpl();
    cssFiles.Add(XMLWorkerHelper.GetInstance().GetDefaultCSS());
    var cssResolver = new StyleAttrCSSResolver(cssFiles);
    cssResolver.AddCss(@"code { padding: 2px 4px; }", "utf-8", true);

    var charset = Encoding.UTF8;
    var hpc = new HtmlPipelineContext(new CssAppliersImpl(new XMLWorkerFontProvider()));
    hpc.SetAcceptUnknown(true).AutoBookmark(true).SetTagFactory(tagProcessors); // inject the tagProcessors

    // Pipeline order: CSS resolver -> HTML -> PDF writer.
    var htmlPipeline = new HtmlPipeline(hpc, new PdfWriterPipeline(doc, writer));
    var pipeline = new CssResolverPipeline(cssResolver, htmlPipeline);
    var worker = new XMLWorker(pipeline, true);
    var xmlParser = new XMLParser(true, worker, charset);
    xmlParser.Parse(new StringReader(html));
}
// Open the generated file with the associated application.
Process.Start("test.pdf");
string originalFile = "Original1.pdf";
string copyOfOriginal = "Re-copia.pdf";

// Decode the base64 payload and write it to disk as the source PDF.
// FileMode.CreateNew is kept from the original: it throws if the file already exists.
byte[] bytes = Convert.FromBase64String(archivo);
using (FileStream stream = new FileStream(originalFile, FileMode.CreateNew))
using (BinaryWriter writer = new BinaryWriter(stream))
{
    writer.Write(bytes, 0, bytes.Length);
}

PdfReader reader1 = new PdfReader(originalFile);
try
{
    using (FileStream fs = new FileStream(copyOfOriginal, FileMode.Create, FileAccess.Write, FileShare.None))
    // Creating iTextSharp.text.pdf.PdfStamper object to write
    // Data from iTextSharp.text.pdf.PdfReader object to FileStream object
    using (PdfStamper stamper = new PdfStamper(reader1, fs))
    {
        int pageCount = reader1.NumberOfPages;
        // Create New Layer for Watermark
        PdfLayer layer = new PdfLayer("WatermarkLayer", stamper.Writer);
        // The font does not depend on the page, so it is created once.
        BaseFont font = BaseFont.CreateFont(BaseFont.HELVETICA, BaseFont.CP1252, BaseFont.NOT_EMBEDDED);

        // NOTE(review): the loop starts at pageCount, so only the LAST page is stamped.
        // If every page should be stamped, start at 1 instead — confirm the intent.
        for (int i = pageCount; i <= pageCount; i++)
        {
            // Get the ContentByte object (content drawn underneath the page content)
            PdfContentByte cb = stamper.GetUnderContent(i);
            // Tell the cb that the next commands should be "bound" to this new layer
            cb.BeginLayer(layer);
            cb.SetFontAndSize(font, 50);
            PdfGState gState = new PdfGState();
            cb.SetGState(gState);

            BarcodePDF417 bcpdf417 = new BarcodePDF417();
            // Assigns the barcode text (as ASCII bytes) to the Text property of the object.
            bcpdf417.Text = ASCIIEncoding.ASCII.GetBytes(codBarras);
            Image imgpdf417 = bcpdf417.GetImage();
            imgpdf417.SetAbsolutePosition(50, 50);
            imgpdf417.ScalePercent(100);
            cb.AddImage(imgpdf417);
            // Close the layer
            cb.EndLayer();
        }
    }
}
finally
{
    // Release the reader even when stamping fails.
    reader1.Close();
}
I have an mp4 + vtt subtitle video player project on WP8 (C#). I'm looking at Microsoft.PlayerFramework.MediaPlayer and WebVTTPlugin, and it works perfectly:
https://playerframework.codeplex.com/wikipage?title=Closed%20Captions%3a%20WebVTT
I use this code and it works perfectly. Unfortunately, the .vtt caption files in my project use "," as the decimal separator, which makes my app crash. I need to download the subtitle, replace every "," with "." and save it to isolated storage. I can handle that, but I can't set the caption source to isolated storage because isolated storage does not have a URI. I know I'm not explaining this well, so here is an example:
my caption:
(http://dizilab.com/captions/chuck/sezon-1/tr/2.vtt?v=5.2)
1
00:00:06,600 --> 00:00:10,900
Merhaba. Benim adım Charles Bartowski,
ama bana Chuck diyebilirsiniz.
2
00:00:11,323 --> 00:00:12,711
Bunlar benim ayakkabılarım.
3
00:00:12,771 --> 00:00:14,209
Bu da benim hayatım.
4
00:00:14,249 --> 00:00:19,042
Casuslar, araba takipleri, bilgisayar
çalan ninjalar ve günü kurtaran ben.
The correct caption is:
WEBVTT FILE
1
00:00:06.600 --> 00:00:10.900 Merhaba. Benim adım
Charles Bartowski, ama bana Chuck diyebilirsiniz.
2 00:00:11.323 --> 00:00:12.711 Bunlar benim ayakkabılarım.
3 00:00:12.771 --> 00:00:14.209 Bu da benim hayatım.
4 00:00:14.249 --> 00:00:19.042 Casuslar, araba takipleri, bilgisayar
çalan ninjalar ve günü kurtaran ben.
And this is the code:
using Microsoft.PlayerFramework.WebVTT;
using System.IO.IsolatedStorage;
using System.IO;
using System.Threading.Tasks;
using Microsoft.PlayerFramework;
namespace PanoramaApp1
{
public partial class MainPage : PhoneApplicationPage
{
// Constructor
public string alinanveri="";
/// <summary>
/// Creates the page, adds a media player with the WebVTT plugin, starts the caption
/// download and registers a single caption track on the player.
/// </summary>
public MainPage()
{
    InitializeComponent();
    Microsoft.PlayerFramework.MediaPlayer player = new Microsoft.PlayerFramework.MediaPlayer();
    Microsoft.PlayerFramework.WebVTT.WebVTTPlugin webvttPlugin = new WebVTTPlugin();
    Microsoft.PlayerFramework.Caption caption = new Microsoft.PlayerFramework.Caption();
    player.IsCaptionSelectionVisible = true;
    player.Plugins.Add(webvttPlugin);

    // altyazikaydet is async void, so this call returns at its first await; the file
    // "altyazi.vtt" is not guaranteed to exist yet when the next statement runs. The
    // original opened that file here through a StreamReader that was never used or
    // disposed; that read has been removed.
    altyazikaydet("http://dizilab.com/captions/chuck/sezon-1/tr/2.vtt?v=5.2");

    // NOTE(review): placeholder value from the original — this string is not a valid
    // absolute URI and must be replaced with a URI the WebVTT plugin can load.
    caption.Source = new Uri("i cant use here for access isostorage"); // url points to sample.vtt file
    caption.Description = "Türkçe";
    player.AvailableCaptions.Add(caption);
    player.SelectedCaption = player.AvailableCaptions.FirstOrDefault();
    LayoutRoot.Children.Add(player);
    player.Source = new Uri("https://redirector.googlevideo.com/videoplayback?requiressl=yes&shardbypass=yes&cmbypass=yes&id=eafe5f42d368b2e0&itag=18&source=picasa&cmo=secure_transport%3Dyes&ip=0.0.0.0&ipbits=0&expire=1420976098&sparams=requiressl,shardbypass,cmbypass,id,itag,source,ip,ipbits,expire&signature=6E257266C2AAADDFC3260B0AADE603F7E421E130.A933FF8365247DEC72A34B71B02FA3B13C57F291&key=lh1", UriKind.RelativeOrAbsolute); // url points to sample.mp4 file
}
/// <summary>
/// Downloads a caption file, converts comma decimal separators in its timestamps to
/// dots, prepends a "WEBVTT FILE" header and stores the result as "altyazi.vtt" in
/// isolated storage. The stored file is then read back and each line is shown in a
/// message box.
/// </summary>
/// <param name="altyaziurl">URL of the caption file to download.</param>
private async void altyazikaydet(string altyaziurl)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(altyaziurl);
        using (var response = (HttpWebResponse)(await Task<WebResponse>.Factory.FromAsync(request.BeginGetResponse, request.EndGetResponse, null)))
        using (var responseStream = response.GetResponseStream())
        using (var sr = new StreamReader(responseStream))
        {
            alinanveri = await sr.ReadToEndAsync();
        }

        // Only rewrite the comma that sits between seconds and milliseconds of a
        // timestamp ("00:00:06,600" -> "00:00:06.600"). A blanket replace of
        // ",<digit>" would also alter commas inside the subtitle text itself.
        string x = System.Text.RegularExpressions.Regex.Replace(
            alinanveri, @"(\d{2}:\d{2}),(\d{3})", "$1.$2");
        x = "WEBVTT FILE" + Environment.NewLine + x;

        // using blocks guarantee the store and the writer are released even if the write fails.
        using (IsolatedStorageFile file = IsolatedStorageFile.GetUserStoreForApplication())
        using (StreamWriter yazici = new StreamWriter(new IsolatedStorageFileStream("altyazi.vtt", FileMode.Create, file)))
        {
            yazici.WriteLine(x);
        }

        // Read the stored file back and show it line by line.
        using (IsolatedStorageFile kayitliDepo = IsolatedStorageFile.GetUserStoreForApplication())
        using (StreamReader okuyucu = new StreamReader(new IsolatedStorageFileStream("altyazi.vtt", FileMode.Open, kayitliDepo)))
        {
            string line;
            while ((line = okuyucu.ReadLine()) != null)
            {
                MessageBox.Show(line);
            }
        }
    }
    catch (Exception ex)
    {
        // This method is async void, so the caller cannot observe an exception from it.
        // The failure is still swallowed (best effort), but it is now logged.
        System.Diagnostics.Debug.WriteLine("altyazikaydet failed: " + ex);
    }
}
I had the same problem in Windows UWP.
The player.Source allows an absolute local URI but the webvtt plugin does not.
Finally, I found out how to make it work:
using System.IO;
// Address the caption file in the app's local data folder through the ms-appdata scheme.
var captionUri = new Uri(Path.Combine("ms-appdata:///Local/", "test.vtt"));

var caption = new Caption();
caption.Description = "whatever";
caption.Source = captionUri;

// Register the caption and select the first available one.
Player.AvailableCaptions.Add(caption);
Player.SelectedCaption = Player.AvailableCaptions.FirstOrDefault();
I'm not sure if this works, but based on this article, you could try this:
// Resolve the stored caption file's path by opening it and reading the stream's Name.
// NOTE(review): whether IsolatedStorageFileStream.Name exposes a usable absolute path
// depends on the platform — verify on the target device.
string fullpath = "";
using (IsolatedStorageFile kayitliDepo = IsolatedStorageFile.GetUserStoreForApplication())
using (IsolatedStorageFileStream stream = kayitliDepo.OpenFile("altyazi.vtt", FileMode.Open, FileAccess.Read))
{
    fullpath = stream.Name;
}

// The original passed an undefined identifier (Name) and misspelled UriKind.
caption.Source = new Uri(fullpath, UriKind.Absolute); // url points to sample.vtt file
caption.Description = "Türkçe";
player.AvailableCaptions.Add(caption);
player.SelectedCaption = player.AvailableCaptions.FirstOrDefault();