fish-shell/tokenizer.h

/** \file tokenizer.h 

    A specialized tokenizer for tokenizing the fish language. In the
    future, the tokenizer should be extended to support marks,
    tokenizing multiple strings and disposing of unused string
    segments.
*/

#ifndef FISH_TOKENIZER_H
#define FISH_TOKENIZER_H

#include <wchar.h>

/**
   The kinds of token that the tokenizer can report.
*/
enum token_type
{
	/** Tokenizer not yet constructed */
	TOK_NONE,

	/** Error reading token */
	TOK_ERROR,

	/** Invalid token */
	TOK_INVALID,

	/** String token */
	TOK_STRING,

	/** Pipe token */
	TOK_PIPE,

	/** End token */
	TOK_END,

	/** Redirection token */
	TOK_REDIRECT_OUT,

	/** Redirection append token */
	TOK_REDIRECT_APPEND,

	/** Input redirection token */
	TOK_REDIRECT_IN,

	/** Redirection to new fd token */
	TOK_REDIRECT_FD,

	/** Non-clobbering redirection token (presumably the '>?' form) */
	TOK_REDIRECT_NOCLOB,

	/** Send job to background token */
	TOK_BACKGROUND,

	/** Comment token */
	TOK_COMMENT
};

/**
   The kinds of error that the tokenizer can report.
*/
enum tokenizer_error
{
	/** Unterminated quote */
	TOK_UNTERMINATED_QUOTE,

	/** Unterminated subshell */
	TOK_UNTERMINATED_SUBSHELL,

	/** Unterminated escape */
	TOK_UNTERMINATED_ESCAPE,

	/** Any other error */
	TOK_OTHER
};


/*
   Tokenizer flags, passed through the flags argument of tok_init. Each
   flag occupies its own bit.
*/

/**
   Flag telling the tokenizer to accept incomplete parameters,
   i.e. parameters with mismatching parentheses, etc. This is useful
   for tab-completion.
*/
#define TOK_ACCEPT_UNFINISHED (1 << 0)

/**
   Flag telling the tokenizer not to remove comments. Useful for
   syntax highlighting.
*/
#define TOK_SHOW_COMMENTS (1 << 1)

/**
   Flag telling the tokenizer not to generate error messages. This is
   needed when tokenizing off of the main thread, since wgettext is
   not thread safe.
*/
#define TOK_SQUASH_ERRORS (1 << 2)


/**
   The tokenizer struct. Holds the state of a tokenization in
   progress: the string being tokenized, the most recently read
   token, and the option flags given to tok_init.
*/
struct tokenizer
{
	/** A pointer into the original string, showing where the next token begins */
	const wchar_t *buff;
	/** The original string. NOTE(review): this was described as "a copy of
	    the original string", but the tok_init documentation says the string
	    is not copied - confirm against the implementation */
	const wchar_t *orig_buff;
	/** A pointer to the last token */
	wchar_t *last;
	
	/** Type of last token. tok_last_type documents this as one of the
	    token_type values */
	int last_type;
	/** Length of last token */
	size_t last_len;
	/** Offset of last token */
	size_t last_pos;
	/** Whether there are more tokens */
	bool has_next;
	/** Whether incomplete tokens are accepted (see TOK_ACCEPT_UNFINISHED) */
	bool accept_unfinished;
	/** Whether comments should be returned (see TOK_SHOW_COMMENTS) */
	bool show_comments;
	/** Type of last quote, can be either ' or ". */
	wchar_t last_quote;
	/** Last error. Presumably one of the tokenizer_error values - confirm */
	int error;
    /** Whether we are squashing errors (see TOK_SQUASH_ERRORS) */
    bool squash_errors;

    /** Cached line number information. Presumably consulted by
        line_number_of_character_at_offset - confirm in the implementation */
    size_t cached_lineno_offset;
    int cached_lineno_count;

    /**
       Return the line number of the character at the given offset

       \param offset The offset of the character to look up
       \return The line number of that character
    */
    int line_number_of_character_at_offset(size_t offset);

};

/**
  Initialize the tokenizer. b is the string that is to be
  tokenized. It is not copied, and should not be freed by the caller
  until after the tokenizer is destroyed.

  \param tok The tokenizer to initialize
  \param b The string to tokenize
  \param flags Flags to the tokenizer. Setting TOK_ACCEPT_UNFINISHED will cause the tokenizer
  to accept incomplete tokens, such as a subshell without a closing
  parenthesis, as a valid token. Setting TOK_SHOW_COMMENTS will return comments as tokens.
  Setting TOK_SQUASH_ERRORS will stop the tokenizer from generating error messages.
*/
void tok_init( tokenizer *tok, const wchar_t *b, int flags );

/**
  Jump to the next token.

  \param tok The tokenizer to advance
*/
void tok_next( tokenizer *tok );

/**
  Returns the type of the last token. Must be one of the values in the token_type enum.

  \param tok The tokenizer to query
  \return The type of the last token, as an int holding a token_type value
*/
int tok_last_type( tokenizer *tok );

/**
  Returns the last token string. The string should not be freed by the caller.

  \param tok The tokenizer to query
  \return The last token string
*/
wchar_t *tok_last( tokenizer *tok );

/**
  Returns the type of quote of the last quoted token. The tokenizer
  struct documents this value as being either ' or ".

  NOTE(review): this comment used to refer to "the last TOK_QSTRING",
  but no such value exists in the token_type enum in this header.

  \param tok The tokenizer to query
  \return The quote character
*/
wchar_t tok_last_quote( tokenizer *tok );

/**
  Returns true as long as there are more tokens left

  \param tok The tokenizer to query
  \return Non-zero while more tokens remain
*/
int tok_has_next( tokenizer *tok );

/**
  Returns the position of the beginning of the current token in the original string

  NOTE(review): the return type is int, while the tokenizer struct
  stores the offset of the last token (last_pos) as size_t. Confirm
  how the implementation converts between the two.

  \param tok The tokenizer to query
  \return The position of the beginning of the current token
*/
int tok_get_pos( tokenizer *tok );

/**
   Destroy the tokenizer and free associated memory

   \param tok The tokenizer to destroy
*/
void tok_destroy( tokenizer *tok );


/**
   Returns the original string given to the tokenizer

   \param tok The tokenizer to query
   \return The original string
 */
const wchar_t *tok_string( tokenizer *tok );


/**
   Returns only the first token from the specified string. This is a
   convenience function, used to retrieve the first token of a
   string. This can be useful for error messages, etc.

   The returned string should be freed by the caller after use.

   \param str The string to take the first token from
   \return The first token of str
*/
wchar_t *tok_first( const wchar_t *str );

/**
   Move tokenizer position

   \param tok The tokenizer to reposition
   \param pos The new position. Presumably an offset into the original
   string, matching what tok_get_pos returns - confirm
*/
void tok_set_pos( tokenizer *tok, int pos );

/**
   Returns a string description of the specified token type

   \param type The token type to describe. Presumably one of the
   token_type values - confirm
   \return A string describing the token type
*/
const wchar_t *tok_get_desc( int type );

/**
   Get tokenizer error type. Should only be called if tok_last_type returns TOK_ERROR.

   \param tok The tokenizer to query
   \return The error type. Presumably one of the tokenizer_error
   values - confirm
*/
int tok_get_error( tokenizer *tok );


#endif
Some changes to migrate towards C++ and a multithreaded model 2011-12-27 03:18:46 +00:00			`/** \file tokenizer.h`
Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 13:26:39 +00:00
			`A specialized tokenizer for tokenizing the fish language. In the`
			`future, the tokenizer should be extended to support marks,`
			`tokenizing multiple strings and disposing of unused string`
			`segments.`
			`*/`

Add header guards to the header files. darcs-hash:20051004151139-35ec8-7af69b9d7647d145dc621f7eaea726e729cff554.gz 2005-10-04 15:11:39 +00:00			`#ifndef FISH_TOKENIZER_H`
			`#define FISH_TOKENIZER_H`

			`#include <wchar.h>`

Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 13:26:39 +00:00			`/**`
			`Token types`
			`*/`
			`enum token_type`
			`{`
			`TOK_NONE, /*< Tokenizer not yet constructed /`
			`TOK_ERROR, /*< Error reading token /`
			`TOK_INVALID,/*< Invalid token /`
			`TOK_STRING,/*< String token /`
			`TOK_PIPE,/*< Pipe token /`
			`TOK_END,/*< End token /`
			`TOK_REDIRECT_OUT, /*< redirection token /`
			`TOK_REDIRECT_APPEND,/*< redirection append token /`
			`TOK_REDIRECT_IN,/*< input redirection token /`
			`TOK_REDIRECT_FD,/*< redirection to new fd token /`
Implement non-clobbering file io. Use the >? operator for this for now. darcs-hash:20071026184232-75c98-11edcbc7548c8ad3a2d4b648cb7ae18067569f02.gz 2007-10-26 18:42:32 +00:00			`TOK_REDIRECT_NOCLOB, /*<? redirection token /`
Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 13:26:39 +00:00			`TOK_BACKGROUND,/*< send job to bg token /`
			`TOK_COMMENT/*< comment token /`
Initial work towards making autosuggestion smarter by recognizing paths 2012-02-15 19:33:41 +00:00			`};`
Add autoindentation support darcs-hash:20061007005625-ac50b-11873654797eb1e98fd17893022bdf995be3e2aa.gz 2006-10-07 00:56:25 +00:00
			`/**`
			`Tokenizer error types`
			`*/`
			`enum tokenizer_error`
			`{`
			`TOK_UNTERMINATED_QUOTE,`
			`TOK_UNTERMINATED_SUBSHELL,`
			`TOK_UNTERMINATED_ESCAPE,`
			`TOK_OTHER`
			`}`
			`;`

Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 13:26:39 +00:00
			`/**`
			`Flag telling the tokenizer to accept incomplete parameters,`
			`i.e. parameters with mismatching paranthesis, etc. This is useful`
			`for tab-completion.`
			`*/`
			`#define TOK_ACCEPT_UNFINISHED 1`

			`/**`
			`Flag telling the tokenizer not to remove comments. Useful for`
			`syntax highlighting.`
			`*/`
			`#define TOK_SHOW_COMMENTS 2`

Fix a crash when using quotes due to wgettext thread safety issues. 2012-02-17 23:55:54 +00:00			`/** Flag telling the tokenizer to not generate error messages, which we need to do when tokenizing off of the main thread (since wgettext is not thread safe).`
			`*/`
			`#define TOK_SQUASH_ERRORS 4`

Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 13:26:39 +00:00
			`/**`
Some changes to migrate towards C++ and a multithreaded model 2011-12-27 03:18:46 +00:00			`The tokenizer struct.`
Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 13:26:39 +00:00			`*/`
More work on the instanced parser 2012-01-23 04:47:13 +00:00			`struct tokenizer`
Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 13:26:39 +00:00			`{`
			`/** A pointer into the original string, showing where the next token begins */`
Added some const correctness 2012-02-24 17:32:15 +00:00			`const wchar_t *buff;`
Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 13:26:39 +00:00			`/** A copy of the original string */`
Added some const correctness 2012-02-24 17:32:15 +00:00			`const wchar_t *orig_buff;`
Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 13:26:39 +00:00			`/** A pointer to the last token*/`
			`wchar_t *last;`
Some changes to migrate towards C++ and a multithreaded model 2011-12-27 03:18:46 +00:00
Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 13:26:39 +00:00			`/** Type of last token*/`
			`int last_type;`
			`/** Length of last token*/`
More const and signed correctness. Warnings now fit on one page! 2012-01-15 06:48:53 +00:00			`size_t last_len;`
Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 13:26:39 +00:00			`/** Offset of last token*/`
Lots of work towards making fish build without warnings on Mountain Lion, mostly in terms of using size_t instead of int 2012-08-01 23:32:52 +00:00			`size_t last_pos;`
Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 13:26:39 +00:00			`/** Whether there are more tokens*/`
Lots of work towards making fish build without warnings on Mountain Lion, mostly in terms of using size_t instead of int 2012-08-01 23:32:52 +00:00			`bool has_next;`
Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 13:26:39 +00:00			`/** Whether incomplete tokens are accepted*/`
Lots of work towards making fish build without warnings on Mountain Lion, mostly in terms of using size_t instead of int 2012-08-01 23:32:52 +00:00			`bool accept_unfinished;`
Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 13:26:39 +00:00			`/** Whether commants should be returned*/`
Lots of work towards making fish build without warnings on Mountain Lion, mostly in terms of using size_t instead of int 2012-08-01 23:32:52 +00:00			`bool show_comments;`
Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 13:26:39 +00:00			`/** Type of last quote, can be either ' or ".*/`
			`wchar_t last_quote;`
Add autoindentation support darcs-hash:20061007005625-ac50b-11873654797eb1e98fd17893022bdf995be3e2aa.gz 2006-10-07 00:56:25 +00:00			`/** Last error */`
			`int error;`
Fix a crash when using quotes due to wgettext thread safety issues. 2012-02-17 23:55:54 +00:00			`/* Whether we are squashing errors */`
			`bool squash_errors;`
Fix to restore an optimization from parse_util_get_line_from_offset in a more thread-safe way 2012-08-05 00:44:14 +00:00
			`/* Cached line number information */`
			`size_t cached_lineno_offset;`
			`int cached_lineno_count;`

			`/** Return the line number of the character at the given offset */`
			`int line_number_of_character_at_offset(size_t offset);`

More work on the instanced parser 2012-01-23 04:47:13 +00:00			`};`
Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 13:26:39 +00:00
			`/**`
			`Initialize the tokenizer. b is the string that is to be`
			`tokenized. It is not copied, and should not be freed by the caller`
			`until after the tokenizer is destroyed.`

			`\param tok The tokenizer to initialize`
			`\param b The string to tokenize`
			`\param flags Flags to the tokenizer. Setting TOK_ACCEPT_UNFINISHED will cause the tokenizer`
			`to accept incomplete tokens, such as a subshell without a closing`
			`parenthesis, as a valid token. Setting TOK_SHOW_COMMENTS will return comments as tokens`
Some changes to migrate towards C++ and a multithreaded model 2011-12-27 03:18:46 +00:00
Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 13:26:39 +00:00			`*/`
			`void tok_init( tokenizer tok, const wchar_t b, int flags );`

			`/**`
			`Jump to the next token.`
			`*/`
			`void tok_next( tokenizer *tok );`

			`/**`
			`Returns the type of the last token. Must be one of the values in the token_type enum.`
			`*/`
			`int tok_last_type( tokenizer *tok );`

			`/**`
			`Returns the last token string. The string should not be freed by the caller.`
			`*/`
			`wchar_t tok_last( tokenizer tok );`

			`/**`
			`Returns the type of quote from the last TOK_QSTRING`
			`*/`
			`wchar_t tok_last_quote( tokenizer *tok );`

			`/**`
			`Returns true as long as there are more tokens left`
			`*/`
			`int tok_has_next( tokenizer *tok );`

			`/**`
			`Returns the position of the beginning of the current token in the original string`
			`*/`
			`int tok_get_pos( tokenizer *tok );`

			`/**`
			`Destroy the tokenizer and free asociated memory`
			`*/`
			`void tok_destroy( tokenizer *tok );`


			`/**`
			`Returns the original string to tokenizer`
			`*/`
Added some const correctness 2012-02-24 17:32:15 +00:00			`const wchar_t tok_string( tokenizer tok );`
Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 13:26:39 +00:00

			`/**`
			`Returns only the first token from the specified string. This is a`
			`convenience function, used to retrieve the first token of a`
			`string. This can be useful for error messages, etc.`

			`The string should be freed. After use.`
			`*/`
			`wchar_t tok_first( const wchar_t str );`

			`/**`
			`Move tokenizer position`
			`*/`
			`void tok_set_pos( tokenizer *tok, int pos );`

			`/**`
			`Returns a string description of the specified token type`
			`*/`
			`const wchar_t *tok_get_desc( int type );`

Add autoindentation support darcs-hash:20061007005625-ac50b-11873654797eb1e98fd17893022bdf995be3e2aa.gz 2006-10-07 00:56:25 +00:00			`/**`
			`Get tokenizer error type. Should only be called if tok_last_tope returns TOK_ERROR.`
			`*/`
			`int tok_get_error( tokenizer *tok );`


Add header guards to the header files. darcs-hash:20051004151139-35ec8-7af69b9d7647d145dc621f7eaea726e729cff554.gz 2005-10-04 15:11:39 +00:00			`#endif`