rfc2047parse.c
No OneTemporary
Actions

Size

18 KB

Subscribers

None

rfc2047parse.c
View Options

	/* @file rfc2047parse.c
	* @brief Parser code for RFC 2047 encoded words
	*
	* Copyright (C) 2015 by Bundesamt für Sicherheit in der Informationstechnik
	* Software engineering by Intevation GmbH
	*
	* This file is part of GpgOL.
	*
	* GpgOL is free software; you can redistribute it and/or
	* modify it under the terms of the GNU Lesser General Public
	* License as published by the Free Software Foundation; either
	* version 2.1 of the License, or (at your option) any later version.
	*
	* GpgOL is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	* GNU Lesser General Public License for more details.
	*
	* You should have received a copy of the GNU Lesser General Public License
	* along with this program; if not, see <http://www.gnu.org/licenses/>.
	*/

	/* This code is heavily based (mostly verbatim copy with glib
	* dependencies removed) on GMime rev 496313fb
	* modified by aheinecke@intevation.de
	*
	* Copyright (C) 2000-2014 Jeffrey Stedfast
	*
	* This library is free software; you can redistribute it and/or
	* modify it under the terms of the GNU Lesser General Public License
	* as published by the Free Software Foundation; either version 2.1
	* of the License, or (at your option) any later version.
	*/

	#ifdef HAVE_CONFIG_H
	#include <config.h>
	#endif

	#include <stdbool.h>
	#include "common_indep.h"
	#include <ctype.h>

	#ifdef HAVE_W32_SYSTEM
	# include "mlang-charset.h"
	#endif

	#include "gmime-table-private.h"

	/* maybe we need this at some point later? */
	#define G_MIME_RFC2047_WORKAROUNDS 1


	/* Lookup table mapping an input byte to its base64 sextet value.
	 * 'A'-'Z' map to 0-25, 'a'-'z' to 26-51, '0'-'9' to 52-61, '+' to 62
	 * and '/' to 63.  The padding character '=' maps to 0 so that it is
	 * counted as part of a quartet.  Every other byte maps to 255 (0xff),
	 * which the decoder treats as "not a base64 character".
	 * The table is only ever read, hence const. */
	static const unsigned char gmime_base64_rank[256] = {
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, /* 0x00 */
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, /* 0x10 */
	255,255,255,255,255,255,255,255,255,255,255, 62,255,255,255, 63, /* 0x20: '+' '/' */
	52, 53, 54, 55, 56, 57, 58, 59, 60, 61,255,255,255, 0,255,255,   /* 0x30: '0'-'9' '=' */
	255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,           /* 0x40: 'A'-'O' */
	15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,255,255,255,255,255,  /* 0x50: 'P'-'Z' */
	255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, /* 0x60: 'a'-'o' */
	41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,255,255,255,255,255,  /* 0x70: 'p'-'z' */
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, /* 0x80 */
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, /* 0x90 */
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, /* 0xa0 */
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, /* 0xb0 */
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, /* 0xc0 */
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, /* 0xd0 */
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, /* 0xe0 */
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, /* 0xf0 */
	};

	/* One lexical unit of a header phrase: a run of whitespace, a plain
	 * word, or an rfc2047 encoded-word.  Tokens are chained into a singly
	 * linked list in input order. */
	typedef struct _rfc2047_token {
	struct _rfc2047_token *next; /* following token, NULL at the end of the list */
	char *charset; /* heap-allocated charset name of an encoded-word (released by rfc2047_token_free), NULL otherwise */
	const char *text; /* start of the token's bytes; borrowed, never freed with the token */
	size_t length; /* number of bytes at text; text is not NUL-terminated at this length */
	char encoding; /* 'B' or 'Q' for an encoded-word, 0 for any other token */
	char is_8bit; /* 1 if a non-ASCII byte was seen while scanning a non-encoded token, else 0 */
	} rfc2047_token;

	static rfc2047_token *
	rfc2047_token_new (const char *text, size_t len)
	{
	rfc2047_token *token;

	TSTART;
	token = xmalloc (sizeof (rfc2047_token));
	memset (token, 0, sizeof (rfc2047_token));
	token->length = len;
	token->text = text;

	TRETURN token;
	}

	/* Try to interpret the LEN bytes at WORD as an encoded-word.
	 *
	 * rfc2231 updates the rfc2047 ABNF to:
	 *   encoded-word := "=?" charset ["*" language] "?" encoding "?"
	 *                   encoded-text "?="
	 *
	 * Returns a new token whose text/length describe the encoded-text
	 * (pointing into WORD, not copied), whose charset is a heap copy of
	 * the charset name with any "*language" suffix removed, and whose
	 * encoding is 'B' or 'Q'.  Returns NULL if WORD is not a well formed
	 * encoded-word.  Nothing is allocated on the failure paths. */
	static rfc2047_token *
	rfc2047_token_new_encoded_word (const char *word, size_t len)
	{
	  rfc2047_token *token;
	  const char *charset_start;
	  const char *payload;
	  const char *inptr;
	  char *charset;
	  char *lang;
	  char encoding;
	  size_t n;

	  TSTART;
	  /* check that this could even be an encoded-word token */
	  if (len < 7 || strncmp (word, "=?", 2) != 0
	      || strncmp (word + len - 2, "?=", 2) != 0)
	    {
	      TRETURN NULL;
	    }

	  /* skip over '=?' */
	  charset_start = word + 2;

	  if (*charset_start == '?' || *charset_start == '*')
	    {
	      /* this would result in an empty charset */
	      TRETURN NULL;
	    }

	  /* Find the '?' that ends the charset.  The trailing "?=" is excluded
	   * from the search so that inptr[2] below always lies inside WORD. */
	  inptr = memchr (charset_start, '?', len - 4);
	  if (!inptr || inptr[2] != '?')
	    {
	      TRETURN NULL;
	    }
	  n = (size_t) (inptr - charset_start);

	  /* step onto the encoding character; the '?' behind it was
	   * already verified by the inptr[2] test above */
	  inptr++;

	  switch (*inptr++)
	    {
	    case 'B': case 'b':
	      encoding = 'B';
	      break;
	    case 'Q': case 'q':
	      encoding = 'Q';
	      break;
	    default:
	      TRETURN NULL;
	    }

	  /* the payload begins right after the '?' */
	  payload = inptr + 1;

	  /* the payload ends where the closing "?=" starts */
	  inptr = word + len - 2;

	  /* make sure that we don't have something like: =?iso-8859-1?Q?= */
	  if (payload > inptr)
	    {
	      TRETURN NULL;
	    }

	  /* Only now, with every check passed, copy the charset.  Doing the
	   * allocation last keeps the error returns above free of leaks. */
	  charset = xmalloc (n + 1);
	  memcpy (charset, charset_start, n);
	  charset[n] = '\0';

	  /* trim off the 'language' part if it's there... */
	  if ((lang = strchr (charset, '*')))
	    *lang = '\0';

	  token = rfc2047_token_new (payload, (size_t) (inptr - payload));
	  token->charset = charset;
	  token->encoding = encoding;

	  TRETURN token;
	}

	/* Release TOK together with its charset string.  A NULL TOK is
	 * accepted and ignored.  The text the token points at is not freed. */
	static void
	rfc2047_token_free (rfc2047_token * tok)
	{
	  TSTART;
	  if (tok)
	    {
	      xfree (tok->charset);
	      xfree (tok);
	    }
	  TRETURN;
	}

	static rfc2047_token *
	tokenize_rfc2047_phrase (const char in, size_t len)
	{
	bool enable_rfc2047_workarounds = G_MIME_RFC2047_WORKAROUNDS;
	rfc2047_token list, lwsp, token, *tail;
	register const char *inptr = in;
	bool encoded = false;
	const char text, word;
	bool ascii;
	size_t n;

	TSTART;
	tail = (rfc2047_token *) &list;
	list.next = NULL;
	lwsp = NULL;

	while (*inptr != '\0') {
	text = inptr;
	while (is_lwsp (*inptr))
	inptr++;

	if (inptr > text)
	lwsp = rfc2047_token_new (text, inptr - text);
	else
	lwsp = NULL;

	word = inptr;
	ascii = true;
	if (is_atom (*inptr)) {
	if (enable_rfc2047_workarounds) {
	/* Make an extra effort to detect and
	* separate encoded-word tokens that
	* have been merged with other
	* words. */

	if (!strncmp (inptr, "=?", 2)) {
	inptr += 2;

	/* skip past the charset (if one is even declared, sigh) */
	while (inptr && inptr != '?') {
	ascii = ascii && is_ascii (*inptr);
	inptr++;
	}

	/* sanity check encoding type */
	if (inptr[0] != '?' \|\| !strchr ("BbQq", inptr[1]) \|\| inptr[2] != '?')
	goto non_rfc2047;

	inptr += 3;

	/* find the end of the rfc2047 encoded word token */
	while (*inptr && strncmp (inptr, "?=", 2) != 0) {
	ascii = ascii && is_ascii (*inptr);
	inptr++;
	}

	if (*inptr == '\0') {
	/* didn't find an end marker... */
	inptr = word + 2;
	ascii = true;

	goto non_rfc2047;
	}

	inptr += 2;
	} else {
	non_rfc2047:
	/* stop if we encounter a possible rfc2047 encoded
	* token even if it's inside another word, sigh. */
	while (is_atom (*inptr) && strncmp (inptr, "=?", 2) != 0)
	inptr++;
	}
	} else {
	while (is_atom (*inptr))
	inptr++;
	}

	n = (size_t) (inptr - word);
	if ((token = rfc2047_token_new_encoded_word (word, n))) {
	/* rfc2047 states that you must ignore all
	* whitespace between encoded words */
	if (!encoded && lwsp != NULL) {
	tail->next = lwsp;
	tail = lwsp;
	} else if (lwsp != NULL) {
	rfc2047_token_free (lwsp);
	}

	tail->next = token;
	tail = token;

	encoded = true;
	} else {
	/* append the lwsp and atom tokens */
	if (lwsp != NULL) {
	tail->next = lwsp;
	tail = lwsp;
	}

	token = rfc2047_token_new (word, n);
	token->is_8bit = ascii ? 0 : 1;
	tail->next = token;
	tail = token;

	encoded = false;
	}
	} else {
	/* append the lwsp token */
	if (lwsp != NULL) {
	tail->next = lwsp;
	tail = lwsp;
	}

	ascii = true;
	while (inptr && !is_lwsp (inptr) && !is_atom (*inptr)) {
	ascii = ascii && is_ascii (*inptr);
	inptr++;
	}

	n = (size_t) (inptr - word);
	token = rfc2047_token_new (word, n);
	token->is_8bit = ascii ? 0 : 1;

	tail->next = token;
	tail = token;

	encoded = false;
	}
	}

	*len = (size_t) (inptr - in);

	TRETURN list.next;
	}

	/* Release every token of the list starting at TOKENS.
	 * A NULL list is accepted. */
	static void
	rfc2047_token_list_free (rfc2047_token * tokens)
	{
	  rfc2047_token *following;

	  TSTART;
	  for (; tokens; tokens = following)
	    {
	      /* fetch the link before the node is released */
	      following = tokens->next;
	      rfc2047_token_free (tokens);
	    }
	  TRETURN;
	}

	/* Decode rfc2047's version of quoted-printable.
	 *
	 * IN/LEN is the encoded payload and OUT receives the decoded bytes.
	 * OUT must have room for at least LEN bytes.
	 *
	 * *STATE and *SAVE carry an unfinished "=XX" sequence from one call to
	 * the next, so that a sequence split across two encoded-words is still
	 * decoded: *STATE is the number of hex digits still missing (0, 1 or
	 * 2) and *SAVE holds the first hex digit when one is missing.  Both
	 * must be 0 before the first call.  When LEN is 0 they are left
	 * untouched.
	 *
	 * Malformed sequences are passed through literally.
	 * Returns the number of bytes written to OUT. */
	static size_t
	quoted_decode (const unsigned char *in, size_t len, unsigned char *out, int *state, unsigned int *save)
	{
	  register const unsigned char *inptr;
	  register unsigned char *outptr;
	  const unsigned char *inend;
	  unsigned char c, c1;
	  unsigned int saved;
	  int need;

	  TSTART;
	  if (len == 0)
	    {
	      TRETURN 0;
	    }

	  inend = in + len;
	  outptr = out;
	  inptr = in;

	  need = *state;
	  saved = *save;

	  if (need > 0)
	    {
	      /* finish the sequence the previous chunk left open */
	      if (isxdigit ((int) *inptr))
	        {
	          if (need == 1)
	            {
	              c = toupper ((int) (saved & 0xff));
	              c1 = toupper ((int) *inptr++);
	              saved = 0;
	              need = 0;

	              goto decode;
	            }

	          saved = 0;
	          need = 0;

	          goto equals;
	        }

	      /* last encoded-word ended in a malformed quoted-printable sequence */
	      *outptr++ = '=';

	      if (need == 1)
	        *outptr++ = (char) (saved & 0xff);

	      saved = 0;
	      need = 0;
	    }

	  while (inptr < inend)
	    {
	      c = *inptr++;
	      if (c == '=')
	        {
	        equals:
	          if (inend - inptr >= 2)
	            {
	              if (isxdigit ((int) inptr[0]) && isxdigit ((int) inptr[1]))
	                {
	                  c = toupper (*inptr++);
	                  c1 = toupper (*inptr++);
	                decode:
	                  *outptr++ = (((c >= 'A' ? c - 'A' + 10 : c - '0') & 0x0f) << 4)
	                    | ((c1 >= 'A' ? c1 - 'A' + 10 : c1 - '0') & 0x0f);
	                }
	              else
	                {
	                  /* malformed quoted-printable sequence? */
	                  *outptr++ = '=';
	                }
	            }
	          else
	            {
	              /* truncated payload, maybe it was split across encoded-words? */
	              if (inptr < inend)
	                {
	                  if (isxdigit ((int) *inptr))
	                    {
	                      saved = *inptr;
	                      need = 1;
	                      break;
	                    }
	                  else
	                    {
	                      /* malformed quoted-printable sequence? */
	                      *outptr++ = '=';
	                    }
	                }
	              else
	                {
	                  saved = 0;
	                  need = 2;
	                  break;
	                }
	            }
	        }
	      else if (c == '_')
	        {
	          /* _'s are an rfc2047 shortcut for encoding spaces */
	          *outptr++ = ' ';
	        }
	      else
	        {
	          *outptr++ = c;
	        }
	    }

	  *state = need;
	  *save = saved;

	  TRETURN (size_t) (outptr - out);
	}

	/**
	* g_mime_encoding_base64_decode_step:
	* @inbuf: input buffer
	* @inlen: input buffer length
	* @outbuf: output buffer
	* @state: holds the number of bits that are stored in @save
	* @save: leftover bits that have not yet been decoded
	*
	* Decodes a chunk of base64 encoded data.
	*
	* Returns: the number of bytes decoded (which have been dumped in
	* @outbuf).
	**/
	size_t
	g_mime_encoding_base64_decode_step (const unsigned char inbuf, size_t inlen, unsigned char outbuf, int state, unsigned int save)
	{
	register const unsigned char *inptr;
	register unsigned char *outptr;
	const unsigned char *inend;
	register unsigned int saved;
	unsigned char c;
	int npad, n, i;

	TSTART;
	inend = inbuf + inlen;
	outptr = outbuf;
	inptr = inbuf;

	npad = (*state >> 8) & 0xff;
	n = *state & 0xff;
	saved = *save;

	/* convert 4 base64 bytes to 3 normal bytes */
	while (inptr < inend) {
	c = gmime_base64_rank[*inptr++];
	if (c != 0xff) {
	saved = (saved << 6) \| c;
	n++;
	if (n == 4) {
	*outptr++ = saved >> 16;
	*outptr++ = saved >> 8;
	*outptr++ = saved;
	n = 0;

	if (npad > 0) {
	outptr -= npad;
	npad = 0;
	}
	}
	}
	}

	/* quickly scan back for '=' on the end somewhere */
	/* fortunately we can drop 1 output char for each trailing '=' (up to 2) */
	for (i = 2; inptr > inbuf && i; ) {
	inptr--;
	if (gmime_base64_rank[*inptr] != 0xff) {
	if (*inptr == '=' && outptr > outbuf) {
	if (n == 0) {
	/* we've got a complete quartet so it's
	safe to drop an output character. */
	outptr--;
	} else if (npad < 2) {
	/* keep a record of the number of ='s at
	the end of the input stream, up to 2 */
	npad++;
	}
	}

	i--;
	}
	}

	*state = (npad << 8) \| n;
	*save = n ? saved : 0;

	TRETURN (outptr - outbuf);
	}

	/* Decode the payload of the encoded-word TOKEN into OUTBUF, using the
	 * base64 decoder when the token's encoding is 'B' and the
	 * quoted-printable decoder otherwise.  STATE and SAVE are handed
	 * through to the decoder so that consecutive tokens can be decoded as
	 * one continuous stream.  OUTBUF must have room for token->length
	 * bytes.  Returns the number of bytes written to OUTBUF. */
	static size_t
	rfc2047_token_decode (rfc2047_token *token, unsigned char *outbuf, int *state, unsigned int *save)
	{
	  const unsigned char *inbuf = (const unsigned char *) token->text;
	  size_t len = token->length;

	  TSTART;
	  if (token->encoding == 'B')
	    {
	      TRETURN g_mime_encoding_base64_decode_step (inbuf, len, outbuf, state, save);
	    }
	  else
	    {
	      TRETURN quoted_decode (inbuf, len, outbuf, state, save);
	    }
	}

	/* Append at most N bytes of SRC to the NUL-terminated string *BUF,
	 * stopping early at a NUL byte in SRC (the semantics of strncat).
	 * *USED is the current string length and *CAP the allocated size;
	 * the buffer is enlarged with xrealloc whenever the new data plus
	 * terminator would not fit. */
	static void
	rfc2047_decoded_append (char **buf, size_t *used, size_t *cap,
	                        const char *src, size_t n)
	{
	  const char *nul = memchr (src, '\0', n);

	  if (nul)
	    n = (size_t) (nul - src);

	  if (*used + n + 1 > *cap)
	    {
	      *cap = *used + n + 1;
	      *buf = xrealloc (*buf, *cap);
	    }

	  memcpy (*buf + *used, src, n);
	  *used += n;
	  (*buf)[*used] = '\0';
	}

	/* Convert the token list TOKENS into one newly allocated string.
	 *
	 * Plain tokens are copied verbatim.  Runs of adjacent encoded-words
	 * with the same encoding and charset are decoded as one block and then
	 * converted to UTF-8 (text declared as UTF-8 is taken as is).  If the
	 * conversion fails an error is logged and the run is left out.
	 *
	 * BUFLEN is the length of the tokenized input and is used as the
	 * initial size of the result; the result grows when charset conversion
	 * produces more bytes than the input had.
	 *
	 * Returns the decoded string, to be released with xfree. */
	static char *
	rfc2047_decode_tokens (rfc2047_token *tokens, size_t buflen)
	{
	  rfc2047_token *token, *next;
	  size_t outlen, len, tmplen;
	  size_t decoded_len, decoded_cap;
	  unsigned char *outptr;
	  unsigned char *outbuf;
	  const char *charset;
	  char *decoded;
	  char encoding;
	  unsigned int save;
	  int state;
	  char *str;

	  TSTART;
	  decoded_cap = buflen + 1;
	  decoded = xmalloc (decoded_cap);
	  decoded[0] = '\0';
	  decoded_len = 0;

	  tmplen = 76;
	  outbuf = xmalloc (tmplen);

	  token = tokens;
	  while (token != NULL)
	    {
	      next = token->next;

	      if (!token->encoding)
	        {
	          rfc2047_decoded_append (&decoded, &decoded_len, &decoded_cap,
	                                  token->text, token->length);
	          if (token->is_8bit)
	            {
	              /* We don't support this. */
	              log_error ("%s:%s: Unknown 8bit encoding detected.",
	                         SRCNAME, __func__);
	            }

	          token = next;
	          continue;
	        }

	      /* In order to work around broken mailers, we need to combine
	       * the raw decoded content of runs of identically encoded word
	       * tokens before converting into UTF-8. */
	      encoding = token->encoding;
	      charset = token->charset;
	      len = token->length;
	      state = 0;
	      save = 0;

	      /* find the end of the run (and measure the buffer length we'll need) */
	      while (next && next->encoding == encoding
	             && !strcmp (next->charset, charset))
	        {
	          len += next->length;
	          next = next->next;
	        }

	      /* make sure our temporary output buffer is large enough for the
	       * decoded bytes plus a terminating NUL */
	      if (len + 1 > tmplen)
	        {
	          tmplen = len + 1;
	          outbuf = xrealloc (outbuf, tmplen);
	        }

	      /* base64 / quoted-printable decode each of the tokens... */
	      outptr = outbuf;
	      outlen = 0;
	      do
	        {
	          /* Note: by not resetting state/save each loop, we effectively
	           * treat the payloads as one continuous block, thus allowing
	           * us to handle cases where a hex-encoded triplet of a
	           * quoted-printable encoded payload is split between 2 or more
	           * encoded-word tokens. */
	          len = rfc2047_token_decode (token, outptr, &state, &save);
	          token = token->next;
	          outptr += len;
	          outlen += len;
	        }
	      while (token != next);

	      /* The raw bytes are also used as a C string below (logging and
	       * the test build), so terminate them. */
	      outbuf[outlen] = '\0';

	      /* convert the raw decoded text into UTF-8 */
	      if (!strcasecmp (charset, "UTF-8"))
	        {
	          rfc2047_decoded_append (&decoded, &decoded_len, &decoded_cap,
	                                  (const char *) outbuf, outlen);
	        }
	      else
	        {
	#ifndef BUILD_TESTS
	          str = ansi_charset_to_utf8 (charset, (const char *) outbuf, outlen, 0);
	#else
	          log_debug ("%s:%s: Conversion not available for testing",
	                     SRCNAME, __func__);
	          str = strdup ((const char *) outbuf);
	#endif
	          if (!str)
	            {
	              log_error ("%s:%s: Failed conversion from: %s for word: %s.",
	                         SRCNAME, __func__, charset,
	                         anonstr ((const char *) outbuf));
	            }
	          else
	            {
	              /* The converted text may be longer than the encoded
	               * input, so it must go through the growing append. */
	              rfc2047_decoded_append (&decoded, &decoded_len, &decoded_cap,
	                                      str, strlen (str));
	              xfree (str);
	            }
	        }

	      token = next;
	    }

	  xfree (outbuf);

	  TRETURN decoded;
	}


	/**
	* g_mime_utils_header_decode_phrase:
	* @phrase: header to decode
	*
	* Decodes an rfc2047 encoded 'phrase' header.
	*
	* Note: See g_mime_set_user_charsets() for details on how charset
	* conversion is handled for unencoded 8bit text and/or wrongly
	* specified rfc2047 encoded-word tokens.
	*
	* Returns: a newly allocated UTF-8 string representing the the decoded
	* header.
	**/
	static char *
	g_mime_utils_header_decode_phrase (const char *phrase)
	{
	rfc2047_token *tokens;
	char *decoded;
	size_t len;

	TSTART;
	tokens = tokenize_rfc2047_phrase (phrase, &len);
	decoded = rfc2047_decode_tokens (tokens, len);
	rfc2047_token_list_free (tokens);

	TRETURN decoded;
	}

	/* Try to parse an rfc 2047 filename for attachment handling.
	   Returns the decoded string as a newly allocated buffer.  If INPUT
	   is NULL an empty string is returned.  If decoding yields nothing,
	   a copy of INPUT made with xstrdup is returned instead. */
	char *
	rfc2047_parse (const char *input)
	{
	  char *decoded;

	  TSTART;
	  if (!input)
	    {
	      TRETURN xstrdup ("");
	    }

	  log_data ("%s:%s: Input: \"%s\"",
	            SRCNAME, __func__, input);

	  decoded = g_mime_utils_header_decode_phrase (input);

	  /* never hand a NULL pointer to a %s conversion */
	  log_data ("%s:%s: Decoded: \"%s\"",
	            SRCNAME, __func__, decoded ? decoded : "");

	  if (!decoded || !strlen (decoded))
	    {
	      xfree (decoded);
	      TRETURN xstrdup (input);
	    }
	  TRETURN decoded;
	}

File Metadata

Mime Type: text/x-c
Expires: Mon, Dec 23, 4:42 PM (10 h, 14 m)
Storage Engine: local-disk
Storage Format: Raw Data
Storage Handle: 5a/0c/42c64d16c4621449126628093442

rfc2047parse.cNo OneTemporaryActions

rfc2047parse.cView Options

File Metadata

Event Timeline

rfc2047parse.c
No OneTemporary
Actions

rfc2047parse.c
View Options