www.webdeveloper.com
Results 1 to 2 of 2

Thread: Extract data from .rtf file

Hybrid View

  1. #1
    Join Date
    Jul 2013
    Posts
    18

    Exclamation Extract data from .rtf file

    This function extracts the text of an .rtf file, but the problem is that it also returns some information that exists in the file but is not shown when the document is displayed; PHP can still read it.

    Now I need to extract only the data that is actually shown.



    Code:
    /**
     * Extract the visible text of an RTF file as a UTF-8 string.
     *
     * The file is scanned byte by byte. Every {...} group gets its own state
     * array (a copy of the enclosing group's state) in which the control words
     * seen so far are recorded. Text is appended to the result only while
     * rtf_isPlainText() accepts the current group state.
     *
     * NOTE(review): rtf_isPlainText() is not defined in this block; it is
     * assumed to take a group-state array and return a boolean - confirm
     * against its definition.
     *
     * Known limitation: \'hh escapes are emitted as the code point equal to
     * the byte value; no code-page conversion is performed.
     *
     * @param string $filename Path of the RTF file to read.
     * @return string The extracted text, or "" when the file is empty or
     *                could not be read.
     */
    function rtf2text($filename) {

        $text = file_get_contents($filename);
        // file_get_contents() returns false on failure; treat it like an empty file.
        if ($text === false || $text === "")
            return "";

        $len = strlen($text);
        $document = "";
        // Element 0 is a root state, so text, control words or a stray "}"
        // outside any group never touch an undefined stack element.
        $stack = array(array());
        $j = 0;

        // Read the data byte by byte.
        for ($i = 0; $i < $len; $i++) {
            $c = $text[$i];

            switch ($c) {
                // Backslash introduces a control symbol or a control word.
                case "\\":
                    // A backslash as the very last byte has nothing to escape.
                    if ($i + 1 >= $len)
                        break;
                    $nc = $text[$i + 1];

                    if ($nc === '\\' || $nc === '{' || $nc === '}') {
                        // Escaped backslash or brace: a literal character.
                        if (rtf_isPlainText($stack[$j]))
                            $document .= $nc;
                        $i++;
                    } elseif ($nc === '~') {
                        // Non-breaking space.
                        if (rtf_isPlainText($stack[$j]))
                            $document .= ' ';
                        $i++;
                    } elseif ($nc === '_') {
                        // Non-breaking hyphen.
                        if (rtf_isPlainText($stack[$j]))
                            $document .= '-';
                        $i++;
                    } elseif ($nc === '*') {
                        // Remember the asterisk mark in the current group state.
                        $stack[$j]["*"] = true;
                        $i++;
                    } elseif ($nc === "'") {
                        // \'hh: two hexadecimal digits giving a character code.
                        $hex = substr($text, $i + 2, 2);
                        // ENT_QUOTES is required so that &#39; (apostrophe) is decoded as well.
                        if (preg_match('/^[0-9a-fA-F]{2}$/', (string) $hex) && rtf_isPlainText($stack[$j]))
                            $document .= html_entity_decode("&#" . hexdec($hex) . ";", ENT_QUOTES, 'UTF-8');
                        // Skip the quote and both digits; the loop skips the backslash.
                        $i += 3;
                    } elseif (($nc >= 'a' && $nc <= 'z') || ($nc >= 'A' && $nc <= 'Z')) {
                        // Control word: letters, then an optional (possibly negative)
                        // numeric parameter, then an optional single space delimiter.
                        $word = "";
                        $param = null;
                        $k = $i + 1;

                        while ($k < $len && (($text[$k] >= 'a' && $text[$k] <= 'z') || ($text[$k] >= 'A' && $text[$k] <= 'Z'))) {
                            $word .= $text[$k];
                            $k++;
                        }
                        // A minus sign belongs to the parameter only when a digit follows it.
                        if ($k + 1 < $len && $text[$k] === '-' && $text[$k + 1] >= '0' && $text[$k + 1] <= '9') {
                            $param = '-';
                            $k++;
                        }
                        while ($k < $len && $text[$k] >= '0' && $text[$k] <= '9') {
                            $param .= $text[$k];
                            $k++;
                        }
                        // A space that ends a control word is part of it and is not text.
                        if ($k < $len && $text[$k] === ' ')
                            $k++;

                        // Start analyzing what we have read.
                        $toText = "";
                        switch (strtolower($word)) {
                            // \uN: N is the decimal Unicode code point, written as a signed
                            // 16-bit number, followed by fallback characters to be skipped.
                            case "u":
                                if ($param !== null) {
                                    $code = (int) $param;
                                    if ($code < 0)
                                        $code += 65536;
                                    $toText .= html_entity_decode("&#" . $code . ";", ENT_QUOTES, 'UTF-8');

                                    // \ucN gives the number of fallback characters; the RTF
                                    // specification says to assume 1 when no \uc was seen.
                                    $ucDelta = isset($stack[$j]["uc"]) ? (int) $stack[$j]["uc"] : 1;
                                    for ($n = 0; $n < $ucDelta && $k < $len; $n++) {
                                        // Never skip across a group boundary.
                                        if ($text[$k] === '{' || $text[$k] === '}')
                                            break;
                                        if ($text[$k] === '\\') {
                                            // A \'hh escape counts as one fallback character;
                                            // any other control sequence is left to be parsed.
                                            if ($k + 1 < $len && $text[$k + 1] === "'")
                                                $k += 4;
                                            else
                                                break;
                                        } else
                                            $k++;
                                    }
                                }
                            break;
                            // Line feeds, spaces and tabs.
                            case "par": case "page": case "column": case "line": case "lbr":
                                $toText .= "\n";
                            break;
                            case "emspace": case "enspace": case "qmspace":
                                $toText .= " ";
                            break;
                            case "tab": $toText .= "\t"; break;
                            // Current date and time instead of the corresponding fields.
                            case "chdate": $toText .= date("m.d.Y"); break;
                            case "chdpl": $toText .= date("l, j F Y"); break;
                            case "chdpa": $toText .= date("D, j M Y"); break;
                            case "chtime": $toText .= date("H:i:s"); break;
                            // Special characters, decoded to UTF-8.
                            case "emdash": $toText .= html_entity_decode("&mdash;", ENT_QUOTES, 'UTF-8'); break;
                            case "endash": $toText .= html_entity_decode("&ndash;", ENT_QUOTES, 'UTF-8'); break;
                            case "bullet": $toText .= html_entity_decode("&bull;", ENT_QUOTES, 'UTF-8'); break;
                            case "lquote": $toText .= html_entity_decode("&lsquo;", ENT_QUOTES, 'UTF-8'); break;
                            case "rquote": $toText .= html_entity_decode("&rsquo;", ENT_QUOTES, 'UTF-8'); break;
                            case "ldblquote": $toText .= html_entity_decode("&ldquo;", ENT_QUOTES, 'UTF-8'); break;
                            case "rdblquote": $toText .= html_entity_decode("&rdquo;", ENT_QUOTES, 'UTF-8'); break;
                            // Record every other control word in the current group state.
                            // A word without a parameter is stored as true; an explicit
                            // parameter, including "0", is stored as given.
                            default:
                                $stack[$j][strtolower($word)] = ($param === null) ? true : $param;
                            break;
                        }
                        // Add data to the output stream if required.
                        if (rtf_isPlainText($stack[$j]))
                            $document .= $toText;

                        // Resume right after the control word; the loop adds one.
                        $i = $k - 1;
                    } else {
                        // Any other control symbol is skipped together with its character.
                        $i++;
                    }
                break;
                // An opening brace starts a subgroup that inherits a copy of the
                // enclosing group's state.
                case "{":
                    $stack[] = $stack[$j];
                    $j++;
                break;
                // A closing brace ends the subgroup and drops its state. The root
                // state is kept even when the braces are unbalanced.
                case "}":
                    if ($j > 0) {
                        array_pop($stack);
                        $j--;
                    }
                break;
                // Skip NUL, CR, FF and LF. Double quotes are required here:
                // single-quoted '\n' is a backslash followed by the letter n.
                case "\0": case "\r": case "\f": case "\n": break;
                // Add other data to the output stream if required.
                default:
                    if (rtf_isPlainText($stack[$j]))
                        $document .= $c;
                break;
            }

        }

        return $document;
    }

  2. #2
    Join Date
    Jul 2013
    Posts
    18
    I don't know how to ???

Thread Information

Users Browsing this Thread

There are currently 1 users browsing this thread. (0 members and 1 guests)

Tags for this Thread

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •  
HTML5 Development Center



Recent Articles