001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.io.output;
018
019import java.io.File;
020import java.io.FileNotFoundException;
021import java.io.FileOutputStream;
022import java.io.IOException;
023import java.io.OutputStream;
024import java.io.OutputStreamWriter;
025import java.io.StringWriter;
026import java.io.Writer;
027import java.nio.charset.Charset;
028import java.nio.charset.StandardCharsets;
029import java.util.Locale;
030import java.util.Objects;
031import java.util.regex.Matcher;
032
033import org.apache.commons.io.Charsets;
034import org.apache.commons.io.IOUtils;
035import org.apache.commons.io.build.AbstractStreamBuilder;
036import org.apache.commons.io.input.XmlStreamReader;
037
038/**
039 * Character stream that handles all the necessary work to figure out the charset encoding of the XML document written to the stream.
040 * <p>
041 * To build an instance, see {@link Builder}.
042 * </p>
043 *
044 * @see XmlStreamReader
045 * @since 2.0
046 */
047public class XmlStreamWriter extends Writer {
048
049    /**
050     * Builds a new {@link XmlStreamWriter} instance.
051     * <p>
052     * For example:
053     * </p>
054     * <pre>{@code
055     * WriterOutputStream w = WriterOutputStream.builder()
056     *   .setPath(path)
057     *   .setCharset(StandardCharsets.UTF_8)
058     *   .get();}
059     * </pre>
060     *
061     * @since 2.12.0
062     */
063    public static class Builder extends AbstractStreamBuilder<XmlStreamWriter, Builder> {
064
065        public Builder() {
066            setCharsetDefault(StandardCharsets.UTF_8);
067            setCharset(StandardCharsets.UTF_8);
068        }
069
070        /**
071         * Constructs a new instance.
072         * <p>
073         * This builder use the aspect OutputStream, OpenOption[], and Charset.
074         * </p>
075         * <p>
076         * You must provide an origin that can be converted to an OutputStream by this builder, otherwise, this call will throw an
077         * {@link UnsupportedOperationException}.
078         * </p>
079         *
080         * @return a new instance.
081         * @throws UnsupportedOperationException if the origin cannot provide an OutputStream.
082         * @throws IOException                   if an I/O error occurs.
083         * @see #getOutputStream()
084         */
085        @SuppressWarnings("resource")
086        @Override
087        public XmlStreamWriter get() throws IOException {
088            return new XmlStreamWriter(getOutputStream(), getCharset());
089        }
090
091    }
092
093    private static final int BUFFER_SIZE = IOUtils.DEFAULT_BUFFER_SIZE;
094
095    /**
096     * Constructs a new {@link Builder}.
097     *
098     * @return a new {@link Builder}.
099     * @since 2.12.0
100     */
101    public static Builder builder() {
102        return new Builder();
103    }
104
105    private final OutputStream out;
106
107    private final Charset defaultCharset;
108
109    private StringWriter prologWriter = new StringWriter(BUFFER_SIZE);
110
111    private Writer writer;
112
113    private Charset charset;
114
115    /**
116     * Constructs a new XML stream writer for the specified file
117     * with a default encoding of UTF-8.
118     *
119     * @param file The file to write to
120     * @throws FileNotFoundException if there is an error creating or
121     * opening the file
122     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
123     */
124    @Deprecated
125    public XmlStreamWriter(final File file) throws FileNotFoundException {
126        this(file, null);
127    }
128
129    /**
130     * Constructs a new XML stream writer for the specified file
131     * with the specified default encoding.
132     *
133     * @param file The file to write to
134     * @param defaultEncoding The default encoding if not encoding could be detected
135     * @throws FileNotFoundException if there is an error creating or
136     * opening the file
137     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
138     */
139    @Deprecated
140    @SuppressWarnings("resource")
141    public XmlStreamWriter(final File file, final String defaultEncoding) throws FileNotFoundException {
142        this(new FileOutputStream(file), defaultEncoding);
143    }
144
145    /**
146     * Constructs a new XML stream writer for the specified output stream
147     * with a default encoding of UTF-8.
148     *
149     * @param out The output stream
150     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
151     */
152    @Deprecated
153    public XmlStreamWriter(final OutputStream out) {
154        this(out, StandardCharsets.UTF_8);
155    }
156
157    /**
158     * Constructs a new XML stream writer for the specified output stream
159     * with the specified default encoding.
160     *
161     * @param out The output stream
162     * @param defaultEncoding The default encoding if not encoding could be detected
163     */
164    private XmlStreamWriter(final OutputStream out, final Charset defaultEncoding) {
165        this.out = out;
166        this.defaultCharset = Objects.requireNonNull(defaultEncoding);
167    }
168
169    /**
170     * Constructs a new XML stream writer for the specified output stream
171     * with the specified default encoding.
172     *
173     * @param out The output stream
174     * @param defaultEncoding The default encoding if not encoding could be detected
175     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
176     */
177    @Deprecated
178    public XmlStreamWriter(final OutputStream out, final String defaultEncoding) {
179        this(out, Charsets.toCharset(defaultEncoding, StandardCharsets.UTF_8));
180    }
181
182    /**
183     * Closes the underlying writer.
184     *
185     * @throws IOException if an error occurs closing the underlying writer
186     */
187    @Override
188    public void close() throws IOException {
189        if (writer == null) {
190            charset = defaultCharset;
191            writer = new OutputStreamWriter(out, charset);
192            writer.write(prologWriter.toString());
193        }
194        writer.close();
195    }
196
197    /**
198     * Detects the encoding.
199     *
200     * @param cbuf the buffer to write the characters from
201     * @param off The start offset
202     * @param len The number of characters to write
203     * @throws IOException if an error occurs detecting the encoding
204     */
205    private void detectEncoding(final char[] cbuf, final int off, final int len)
206            throws IOException {
207        int size = len;
208        final StringBuffer xmlProlog = prologWriter.getBuffer();
209        if (xmlProlog.length() + len > BUFFER_SIZE) {
210            size = BUFFER_SIZE - xmlProlog.length();
211        }
212        prologWriter.write(cbuf, off, size);
213
214        // try to determine encoding
215        if (xmlProlog.length() >= 5) {
216            if (xmlProlog.substring(0, 5).equals("<?xml")) {
217                // try to extract encoding from XML prolog
218                final int xmlPrologEnd = xmlProlog.indexOf("?>");
219                if (xmlPrologEnd > 0) {
220                    // ok, full XML prolog written: let's extract encoding
221                    final Matcher m = XmlStreamReader.ENCODING_PATTERN.matcher(xmlProlog.substring(0,
222                            xmlPrologEnd));
223                    if (m.find()) {
224                        final String encName = m.group(1).toUpperCase(Locale.ROOT);
225                        charset = Charset.forName(encName.substring(1, encName.length() - 1));
226                    } else {
227                        // no encoding found in XML prolog: using default
228                        // encoding
229                        charset = defaultCharset;
230                    }
231                } else if (xmlProlog.length() >= BUFFER_SIZE) {
232                    // no encoding found in first characters: using default
233                    // encoding
234                    charset = defaultCharset;
235                }
236            } else {
237                // no XML prolog: using default encoding
238                charset = defaultCharset;
239            }
240            if (charset != null) {
241                // encoding has been chosen: let's do it
242                prologWriter = null;
243                writer = new OutputStreamWriter(out, charset);
244                writer.write(xmlProlog.toString());
245                if (len > size) {
246                    writer.write(cbuf, off + size, len - size);
247                }
248            }
249        }
250    }
251
252    /**
253     * Flushes the underlying writer.
254     *
255     * @throws IOException if an error occurs flushing the underlying writer
256     */
257    @Override
258    public void flush() throws IOException {
259        if (writer != null) {
260            writer.flush();
261        }
262    }
263
264    /**
265     * Returns the default encoding.
266     *
267     * @return the default encoding
268     */
269    public String getDefaultEncoding() {
270        return defaultCharset.name();
271    }
272
273    /**
274     * Returns the detected encoding.
275     *
276     * @return the detected encoding
277     */
278    public String getEncoding() {
279        return charset.name();
280    }
281
282    /**
283     * Writes the characters to the underlying writer, detecting encoding.
284     *
285     * @param cbuf the buffer to write the characters from
286     * @param off The start offset
287     * @param len The number of characters to write
288     * @throws IOException if an error occurs detecting the encoding
289     */
290    @Override
291    public void write(final char[] cbuf, final int off, final int len) throws IOException {
292        if (prologWriter != null) {
293            detectEncoding(cbuf, off, len);
294        } else {
295            writer.write(cbuf, off, len);
296        }
297    }
298}