// WebSVN – DevTools – /EA_DocGen/trunk/EA_DocGen/TextParser.cs

using System;
using System.Collections;
using System.Text;
using Word;

// TODO
//
// 1) Make more efficient by using style IDs instead of names
// 2) Possibly let the user end a tag simply with </>



namespace EA_DocGen
{
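        /// <summary>
        /// Parses text containing embedded formatting tags (for example the notes of an
        /// EA element) and appends the resulting styled content to the Word document.
        /// </summary>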
        public class TextParser
        {
      // An embedded formatting tag can be one of two types
      // * An EA_DocGen special format
      // * An MS-Word Style based format
      // The type decides which branch of parse() handles the tagged text.
      private enum style_type_e
      {
         // Tag handled by EA_DocGen itself (parse() hands the text to TabularContent).
         STYLE_EA_DOCGEN = 0,
         // Tag that maps onto an MS-Word style or character format (implicit value 1).
         STYLE_MS_WORD,
         // Not a known tag type. Not referenced anywhere else in this file.
         STYLE_UNKNOWN = -1
      };
    
      // A token is a block of text associated with a tag (style) name and type.
      // The same struct doubles as a style definition in the styleDefs table, in which
      // case txt is null (see formStyleDef()).
      private struct token_type
      {
         // The text enclosed by the tag; null until the parser has found the content.
         public string txt;
         // Whether the style is an EA_DocGen special format or an MS-Word style.
         public style_type_e styleType;
         // Style name; compared against the EA_Constants style names in parse().
         public string styleName;
         // General purpose value. initialise() stores 1..5 here for the list styles
         // (one more than the digit in the tag name) and 0 for everything else; parse()
         // compares it with the previous list level to decide on a numbering restart.
         public int gen1;
      };

      // Some formatting has to be applied after ALL the text of a description has been 
      // appended to the word document. For this formatting, we need to remember the word
      // ranges that denote the text and the style name to apply.
      // An entry whose range is null carries no formatting: parse() treats it as a marker
      // that resets the remembered list level, so the next numbered list restarts.
      private struct postFormat_type
      {
         // Pairs a document range (or null for a restart marker) with the token that
         // produced it.
         public postFormat_type(Word.Range wr, token_type tk)
         {
            m_wr = wr;
            m_tk = tk;
         }
         // Range of the appended text to format later; null for a restart marker.
         public Word.Range m_wr;
         // The token (style name, type, list level) the range was created from.
         public token_type m_tk;
      };

      // Use a hash table for recording the allowed tags and their attributes, facilitating rapid
      // lookup during parsing.
      // Keys are the tag names users type (without the angle brackets); values are boxed
      // token_type style definitions built by formStyleDef(). Null until initialise() runs.
      private static Hashtable styleDefs = null;


      /// <summary>
      /// Class initialisation function. Builds the table of tags that users may embed
      /// in their text, replacing any table built by an earlier call.
      /// </summary>
      public static void initialise()
      {
         // start from an empty table
         styleDefs = new Hashtable();

         // The table dictates what tags a user can use in the notes text of an EA element.
         // Each key is the short tag name typed by the end-user; the value is the style
         // definition to use when that tag is found. Any tag added here also needs a
         // matching case in parse().

         // EA_DocGen tags
         styleDefs.Add( EA_Constants.EA_DocGenTable, formStyleDef( style_type_e.STYLE_EA_DOCGEN, EA_Constants.EA_DocGenTable, 0 ) );

         // MS-Word character formatting tags
         addWordStyle( "b", EA_Constants.styleName_Bold, 0 );
         addWordStyle( "i", EA_Constants.styleName_Italic, 0 );
         addWordStyle( "u", EA_Constants.styleName_Underline, 0 );

         // MS-Word list tags: five levels per family, tag = prefix + level digit
         addListFamily( "lb", new string[] {
            EA_Constants.styleName_ListBullet0,
            EA_Constants.styleName_ListBullet1,
            EA_Constants.styleName_ListBullet2,
            EA_Constants.styleName_ListBullet3,
            EA_Constants.styleName_ListBullet4 } );
         addListFamily( "ln", new string[] {
            EA_Constants.styleName_ListNumber0,
            EA_Constants.styleName_ListNumber1,
            EA_Constants.styleName_ListNumber2,
            EA_Constants.styleName_ListNumber3,
            EA_Constants.styleName_ListNumber4 } );
         addListFamily( "li", new string[] {
            EA_Constants.styleName_ListIndent0,
            EA_Constants.styleName_ListIndent1,
            EA_Constants.styleName_ListIndent2,
            EA_Constants.styleName_ListIndent3,
            EA_Constants.styleName_ListIndent4 } );
         addListFamily( "la", new string[] {
            EA_Constants.styleName_AlphaList0,
            EA_Constants.styleName_AlphaList1,
            EA_Constants.styleName_AlphaList2,
            EA_Constants.styleName_AlphaList3,
            EA_Constants.styleName_AlphaList4 } );

         // MS-Word paragraph style tags
         addWordStyle( "code", EA_Constants.styleName_CodeText, 0 );
         addWordStyle( "normal", EA_Constants.styleName_Normal, 0 );
         addWordStyle( "note", EA_Constants.styleName_Note, 0 );
         addWordStyle( "warn", EA_Constants.styleName_Warning, 0 );
      }

      /// <summary>
      /// Helper for initialise(): registers one MS-Word style tag in styleDefs.
      /// </summary>
      /// <param name="tag">Tag name as typed by the user, without angle brackets</param>
      /// <param name="styleName">Style name the tag maps to</param>
      /// <param name="level">Value stored in the definition's gen1 field</param>
      private static void addWordStyle(string tag, string styleName, int level)
      {
         styleDefs.Add( tag, formStyleDef( style_type_e.STYLE_MS_WORD, styleName, level ) );
      }

      /// <summary>
      /// Helper for initialise(): registers a family of list tags. The style at index n
      /// gets the tag prefix followed by n, and a gen1 value of n + 1.
      /// </summary>
      /// <param name="tagPrefix">Leading part of the tag name, e.g. "lb"</param>
      /// <param name="styleNames">Style names ordered by list level, starting at level 0</param>
      private static void addListFamily(string tagPrefix, string[] styleNames)
      {
         for (int level = 0; level < styleNames.Length; level++)
         {
            addWordStyle( tagPrefix + level.ToString(), styleNames[level], level + 1 );
         }
      }

      /// <summary>
      /// Helper for initialise(): packs a style's attributes into a token_type that
      /// carries no text.
      /// </summary>
      /// <param name="styleType">Kind of style (EA_DocGen special or MS-Word)</param>
      /// <param name="styleName">Name of the style</param>
      /// <param name="genvar1">Value to store in the gen1 field</param>
      /// <returns>A token_type with the given attributes and a null txt field</returns>
      private static token_type formStyleDef(style_type_e styleType, string styleName, int genvar1)
      {
         // Every field is assigned explicitly, so the struct needs no "new".
         token_type styleDef;
         styleDef.txt = null;        // a definition has no text of its own
         styleDef.styleType = styleType;
         styleDef.styleName = styleName;
         styleDef.gen1 = genvar1;
         return styleDef;
      }



      /// <summary>
      /// Parse text containing embedded formatting tags (typically the notes of an
      /// element) and use the results to form document content.
      /// </summary>
      /// <remarks>
      /// The work is done in three passes:
      /// 1) tokenise() splits the text into runs, each carrying the style of the
      ///    enclosing &lt;tag&gt;...&lt;/tag&gt; pair (untagged text gets the Body1 style);
      /// 2) appendTokens() appends each run to the Word document;
      /// 3) applyPostFormats() applies the formatting that can only be done once all of
      ///    the text is in the document (bold/italic/underline and the list styles).
      /// Tag errors (out of sequence, unmatched, nested, incomplete) are reported through
      /// Main.WriteOutput, at most once per call, and never stop the generation.
      /// </remarks>
      /// <param name="s">Text to parse. A null value is treated as empty text.</param>
      /// <param name="id">Passed to Main.WriteOutput together with any error message.</param>
      /// <param name="defaultStyle">Style name handed to TextualContent.appendAndSelectText
      /// for untagged text and for text that is formatted afterwards.</param>
      /// <param name="indent_pts">Left indent in points, applied to non-list paragraphs when
      /// greater than zero. List paragraphs are shifted right by the amount this value
      /// exceeds 70.866pt (2.5cm); they are never shifted left.</param>
      /// <param name="continuation">Forwarded to TextualContent.appendAndSelectText for the
      /// first run appended; later runs pass true (false again after a table).
      /// NOTE(review): the exact meaning is defined by TextualContent - confirm there.</param>
      /// <returns>Always true; tag errors are reported, not returned.</returns>
      public static bool parse(string s, int id, string defaultStyle, float indent_pts, bool continuation)
      {
         // Be tolerant of elements that have no notes at all.
         if (s == null)
         {
            s = "";
         }

         // Build the tag table on demand so a caller that forgot initialise() does not crash.
         if (styleDefs == null)
         {
            initialise();
         }

         // get relative (to 2.5cm) indentation user has commanded. At 2.5cm, the pts value is 70.866.
         // We only indent, never outdent so end stop at 0.
         const float base_indent_pts = (float)70.866;
         float relative_indent_adjustment = 0;
         if (indent_pts > 0)
         {
            relative_indent_adjustment = indent_pts - base_indent_pts;
            if (relative_indent_adjustment < 0)
            {
               relative_indent_adjustment = 0;
            }
         }

         ArrayList tokens = tokenise(s, id);
         ArrayList postFormats = appendTokens(tokens, defaultStyle, indent_pts, continuation);
         applyPostFormats(postFormats, relative_indent_adjustment);

         return true;
      }

      /// <summary>
      /// Helper for parse(): writes one tag error message to the output window.
      /// </summary>
      /// <param name="problem">Short description placed before "style tag" in the message</param>
      /// <param name="tagText">The offending tag exactly as it appears in the text</param>
      /// <param name="id">Passed through to Main.WriteOutput</param>
      private static void reportTagError(string problem, string tagText, int id)
      {
         Main.WriteOutput(string.Format("ERROR, Found {0} style tag ({1}), generated document text may be formatted incorrectly.", problem, tagText), id);
      }

      /// <summary>
      /// Helper for parse(): splits the text into tokens, one per run of text that shares
      /// a style. Only tags present in styleDefs are recognised; anything else that looks
      /// like a tag is left in the text as-is.
      /// </summary>
      /// <param name="s">Text to split; must not be null</param>
      /// <param name="id">Passed to Main.WriteOutput together with any error message</param>
      /// <returns>List of token_type values in document order</returns>
      private static ArrayList tokenise(string s, int id)
      {
         ArrayList tokens = new ArrayList();

         bool foundError = false;        // only the first tag error of a call is reported
         bool lookingForTagEnd = false;  // true while inside a recognised <tag>
         int pos_ContentStart = 0;       // first char not yet placed in a token
         string openTagText = null;      // text of the start tag we are currently inside

         // default starting token - may be updated later
         token_type token = formStyleDef(style_type_e.STYLE_MS_WORD, EA_Constants.styleName_Body1, 0);

         // look for a tag
         int pos = s.IndexOf('<', 0);
         while (pos >= 0)
         {
            // record position of tag; the tag name begins at the next char
            int pos_LeftBracket = pos;
            int pos_tagName = pos_LeftBracket + 1;

            // Check if this is a closing tag, and if so skip past the / char
            bool isEnding = false;
            if ((pos_tagName < s.Length) && (s[pos_tagName] == '/'))
            {
               isEnding = true;
               pos_tagName++;
            }

            // look for the closing bracket of the tag
            int pos_RightBracket = s.IndexOf('>', pos_LeftBracket + 1);
            if (pos_RightBracket < 0)
            {
               // Cannot find any '>' so there can be no further tags
               break;
            }

            // Figure out if this is one of the tags we recognise. The table holds boxed
            // structs, so a null result can only mean "no such tag".
            string tagName = s.Substring(pos_tagName, pos_RightBracket - pos_tagName);
            object styleDef = styleDefs[tagName];
            if (styleDef == null)
            {
               // Not a tag, so this '<' is plain text. Resume the search immediately after
               // it rather than at the '>', otherwise a genuine tag lying between the two
               // (as in "x < y <b>z</b>") would be skipped.
               pos = s.IndexOf('<', pos_LeftBracket + 1);
               continue;
            }

            token_type tagStyle = (token_type)styleDef;
            string tagText = s.Substring(pos_LeftBracket, pos_RightBracket - pos_LeftBracket + 1);

            if (isEnding)
            {
               // this is an end tag, ie. </tagname>
               if (!lookingForTagEnd)
               {
                  // end tag without a start tag
                  if (!foundError)
                  {
                     reportTagError("out of sequence", tagText, id);
                     foundError = true;
                  }
               }
               else if (token.styleName == tagStyle.styleName &&
                        token.styleType == tagStyle.styleType)
               {
                  // The end tag matches the start tag. Update the token's text field and
                  // add the now complete token to our list for processing later on.
                  token.txt = s.Substring(pos_ContentStart, pos_LeftBracket - pos_ContentStart);
                  tokens.Add(token);

                  // re-initialise token for next tag search
                  token = formStyleDef(style_type_e.STYLE_MS_WORD, EA_Constants.styleName_Body1, 0);
                  lookingForTagEnd = false;
                  openTagText = null;

                  pos_ContentStart = pos_RightBracket + 1;
               }
               else
               {
                  // end tag does not seem to be the same as the starting tag, so ignore it
                  if (!foundError)
                  {
                     reportTagError("unmatched", tagText, id);
                     foundError = true;
                  }
               }
            }
            else
            {
               // If there is content prior to now that has not been consumed, tokenise it now
               if (pos_LeftBracket > pos_ContentStart)
               {
                  token.txt = s.Substring(pos_ContentStart, pos_LeftBracket - pos_ContentStart);
                  tokens.Add(token);

                  // That text is consumed. Without this, a nested start tag would leave
                  // pos_ContentStart behind and the same text would be emitted a second
                  // time when the enclosing end tag is reached.
                  pos_ContentStart = pos_LeftBracket;
               }

               if (lookingForTagEnd)
               {
                  // nesting is not supported; the inner tag stays in the text as-is
                  if (!foundError)
                  {
                     reportTagError("nested", tagText, id);
                     foundError = true;
                  }
               }
               else
               {
                  // Take on this tag's attributes from the hash table lookup. The text content
                  // is not known yet; it is obtained when we encounter the end tag.
                  token = tagStyle;
                  token.txt = null;

                  pos_ContentStart = pos_RightBracket + 1;
                  lookingForTagEnd = true;

                  // remember the tag itself in case it is never closed
                  openTagText = tagText;
               }
            }

            // look for next tag
            pos = s.IndexOf('<', pos_RightBracket);
         }

         // take care of the last token, if there is one
         if (pos_ContentStart < s.Length)
         {
            token.txt = s.Substring(pos_ContentStart, s.Length - pos_ContentStart);
            tokens.Add(token);
         }

         // A start tag that was never closed. The message quotes the remembered start tag:
         // the bracket positions of the last '<' examined may belong to unrelated text.
         if (lookingForTagEnd && !foundError)
         {
            reportTagError("incomplete", openTagText, id);
            foundError = true;
         }

         return tokens;
      }

      /// <summary>
      /// Helper for parse(): appends the text of every non-empty token to the document and
      /// collects the formatting that has to wait until all of the text is in place.
      /// </summary>
      /// <param name="tokens">Tokens produced by tokenise()</param>
      /// <param name="defaultStyle">Style name used for untagged and post-formatted text</param>
      /// <param name="indent_pts">Left indent in points for non-list paragraphs, if greater than zero</param>
      /// <param name="continuation">Forwarded to TextualContent.appendAndSelectText for the first run</param>
      /// <returns>List of postFormat_type entries in document order</returns>
      private static ArrayList appendTokens(ArrayList tokens, string defaultStyle, float indent_pts, bool continuation)
      {
         ArrayList postFormats = new ArrayList();
         Word.Range wr_body;

         foreach (token_type tt in tokens)
         {
            if (tt.txt == null || tt.txt.Length == 0)
            {
               continue;
            }

            switch (tt.styleType)
            {
               case style_type_e.STYLE_EA_DOCGEN:
                  switch (tt.styleName)
                  {
                     case EA_Constants.EA_DocGenTable:
                        TabularContent.processTableElement(tt.txt, 0, indent_pts);
                        continuation = false;

                        // flag list numbering restart
                        postFormats.Add( new postFormat_type(null, tt) );
                        break;

                     default:
                        break;
                  }
                  break;

               case style_type_e.STYLE_MS_WORD:
                  switch (tt.styleName)
                  {
                     // List all formatting that has to be done after all of the text has been inserted into the 
                     // document ie. post-formatting
                     case EA_Constants.styleName_Bold:
                     case EA_Constants.styleName_Italic:
                     case EA_Constants.styleName_Underline:
                     case EA_Constants.styleName_ListBullet0:
                     case EA_Constants.styleName_ListBullet1:
                     case EA_Constants.styleName_ListBullet2:
                     case EA_Constants.styleName_ListBullet3:
                     case EA_Constants.styleName_ListBullet4:
                     case EA_Constants.styleName_ListNumber0:
                     case EA_Constants.styleName_ListNumber1:
                     case EA_Constants.styleName_ListNumber2:
                     case EA_Constants.styleName_ListNumber3:
                     case EA_Constants.styleName_ListNumber4:
                     case EA_Constants.styleName_ListIndent0:
                     case EA_Constants.styleName_ListIndent1:
                     case EA_Constants.styleName_ListIndent2:
                     case EA_Constants.styleName_ListIndent3:
                     case EA_Constants.styleName_ListIndent4:
                     case EA_Constants.styleName_AlphaList0 :
                     case EA_Constants.styleName_AlphaList1 :
                     case EA_Constants.styleName_AlphaList2 :
                     case EA_Constants.styleName_AlphaList3 :
                     case EA_Constants.styleName_AlphaList4 :
                        wr_body = TextualContent.appendAndSelectText( tt.txt, defaultStyle, continuation );
                        continuation = true;
                        if (wr_body.Characters.Last.Text.Equals("\r"))
                           wr_body.End = wr_body.End - 1;  // don't format the \r char at the end - doing so causes weird ms-word exceptions later on
                        postFormats.Add( new postFormat_type(wr_body, tt) );
                        break;

                     // Paragraph styles that can be applied as the text is appended
                     case EA_Constants.styleName_CodeText:
                     case EA_Constants.styleName_Normal  :
                     case EA_Constants.styleName_Note    :
                     case EA_Constants.styleName_Warning :
                        wr_body = TextualContent.appendAndSelectText( tt.txt, tt.styleName, continuation );
                        continuation = true;
                        if (indent_pts > 0)
                           wr_body.ParagraphFormat.LeftIndent = indent_pts;

                        // flag list numbering restart
                        postFormats.Add( new postFormat_type(null, tt) );
                        break;

                     // Untagged text
                     case EA_Constants.styleName_Body1:
                        wr_body = TextualContent.appendAndSelectText( tt.txt, defaultStyle, continuation );
                        continuation = true;
                        if (indent_pts > 0)
                           wr_body.ParagraphFormat.LeftIndent = indent_pts;

                        // flag list numbering restart if this is printable text.
                        if (tt.txt.Trim().Length > 0)
                           postFormats.Add( new postFormat_type(null, tt) );
                        break;

                     default:
                        break;
                  }
                  break;

               default:
                  break;
            }
         }

         return postFormats;
      }

      /// <summary>
      /// Helper for parse(): applies the deferred formatting commands to text that has
      /// already been appended to the document.
      /// </summary>
      /// <param name="postFormats">Entries collected by appendTokens(), in document order</param>
      /// <param name="relative_indent_adjustment">Extra left indent in points added to list
      /// paragraphs when greater than zero</param>
      private static void applyPostFormats(ArrayList postFormats, float relative_indent_adjustment)
      {
         // List level (gen1) of the most recent list paragraph; 0 means "not in a list"
         int last_list_level = 0;

         foreach (postFormat_type pf in postFormats)
         {
            // a null word range implies we must restart numbering for any lists
            if (pf.m_wr == null)
            {
               last_list_level = 0;
               continue;
            }

            switch (pf.m_tk.styleName)
            {
               case EA_Constants.styleName_Bold:
                  pf.m_wr.Select();
                  createWordDoc.WordApp.Selection.Range.Bold = 1;
                  last_list_level = 0;
                  break;

               case EA_Constants.styleName_Italic:
                  pf.m_wr.Select();
                  createWordDoc.WordApp.Selection.Range.Italic = 1;
                  last_list_level = 0;
                  break;

               case EA_Constants.styleName_Underline:
                  pf.m_wr.Select();
                  createWordDoc.WordApp.Selection.Range.Underline = Word.WdUnderline.wdUnderlineSingle;
                  last_list_level = 0;
                  break;

               case EA_Constants.styleName_ListBullet0:
               case EA_Constants.styleName_ListBullet1:
               case EA_Constants.styleName_ListBullet2:
               case EA_Constants.styleName_ListBullet3:
               case EA_Constants.styleName_ListBullet4:
               case EA_Constants.styleName_ListIndent0:
               case EA_Constants.styleName_ListIndent1:
               case EA_Constants.styleName_ListIndent2:
               case EA_Constants.styleName_ListIndent3:
               case EA_Constants.styleName_ListIndent4:
               case EA_Constants.styleName_ListNumber0:
               case EA_Constants.styleName_ListNumber1:
               case EA_Constants.styleName_ListNumber2:
               case EA_Constants.styleName_ListNumber3:
               case EA_Constants.styleName_ListNumber4:
               case EA_Constants.styleName_AlphaList0 :
               case EA_Constants.styleName_AlphaList1 :
               case EA_Constants.styleName_AlphaList2 :
               case EA_Constants.styleName_AlphaList3 :
               case EA_Constants.styleName_AlphaList4 :
                  object style = pf.m_tk.styleName;
                  pf.m_wr.Select();
                  createWordDoc.WordApp.Selection.Range.set_Style(ref style);

                  // Restart numbering when this paragraph is deeper than the previous list
                  // level (or follows non-list content). Only the numbered and alpha
                  // families have numbering to restart.
                  if (last_list_level < pf.m_tk.gen1)
                  {
                     if ((pf.m_tk.styleName.IndexOf("Alpha") >= 0) || (pf.m_tk.styleName.IndexOf("Number") >= 0))
                     {
                        object continuePreviousList = false;
                        object applyTo = Word.WdListApplyTo.wdListApplyToWholeList;
                        object defListBehavour = Word.WdDefaultListBehavior.wdWord10ListBehavior;

                        // re-apply the paragraph's own list template, but not as a continuation
                        Word.ListTemplate lt = createWordDoc.WordApp.Selection.Range.ListFormat.ListTemplate;

                        createWordDoc.WordApp.Selection.Range.ListFormat.ApplyListTemplate(
                           lt, ref continuePreviousList, ref applyTo, ref defListBehavour);
                     }
                  }

                  // shift content right by relative indent adjustment calculated by parse()
                  if (relative_indent_adjustment > 0)
                  {
                     createWordDoc.WordApp.Selection.Range.ParagraphFormat.LeftIndent += relative_indent_adjustment;
                  }

                  last_list_level = pf.m_tk.gen1;
                  break;

               default:
                  break;
            }
         }
      }

      



        }
}
// Subversion Repositories DevTools
//
// (root)/EA_DocGen/trunk/EA_DocGen/TextParser.cs – Rev 2126