please dont rip this site
# This is a shell archive.  Remove anything before this line,
# then unpack it by saving it in a file and typing "sh file".
#
# Wrapped by Ti Lian Hwang <tilh@mars> on Thu Jul  6 15:14:10 1995
#
# This archive contains:
#	dummy.pl	h2a_pcl.pl	parse-html.pl	html-ascii.pl	
#

LANG=""; export LANG
PATH=/bin:/usr/bin:$PATH; export PATH

echo x - dummy.pl
cat >dummy.pl <<'@EOF'
# put your drivers in a file
# 'h2a_$TERM.pl' where $TERM is the environment variable
#
sub html_begin_doc {}

sub html_end_doc {}

sub begin_font {}

sub end_font {}

1;
@EOF

chmod 664 dummy.pl

echo x - h2a_pcl.pl
cat >h2a_pcl.pl <<'@EOF'
#
# HP-PCL font functions to be used by html-to-ascii.pl
# "required" by parse-html"
#
# Author : Ti Lian Hwang
# email  : tilh@sin-ro.sin-ro.dhl.com
#
#
# fixed spacing, 12 pitch, 12 points, Times Roman
$normal_font = "\033(s0p12h12v3T";

sub html_begin_doc {
	# reset, CR=CR,LF=LF+CR,FF=FF+CR
	print "\033E\033&k2G";
}

sub html_end_doc {
	print "\033E";
}

$Begin{"B"} = "begin_font";
$End{"B"} = "end_font";      

$Begin{"I"} = "begin_font";
$End{"I"} = "end_font";      

$Begin{"U"} = "begin_font";
$End{"U"} = "end_font";      

$Begin{"EM"} = "begin_font";
$End{"EM"} = "end_font";      

%font_begin = (
	"PRE","\033(s0p12h3T",
	"H1","\033(s1p20v5t3b",
	"H2","\033(s1p18v5t3b",
	"H3","\033(s1p16v5t3B",
	"H4","\033(s1p14v5t3B",
	"H5","\033(s1p12v5t3B",
	"H6","\033(s1p12v5t3B",
	"B","\033(s3B",
	"U","\033&dD",
	"I","\033(s1S",
	"EM","\033(s3B"
);

%font_end = (
	"PRE",$normal_font,
	"H1",$normal_font . "\033(s0B",
	"H2",$normal_font . "\033(s0B",
	"H3",$normal_font . "\033(s0B",
	"H4",$normal_font . "\033(s0B",
	"H5",$normal_font . "\033(s0B",
	"H6",$normal_font . "\033(s0B",
	"B","\033(s0B",
	"U","\033&d@",
	"I","\033(s0S",
	"EM","\033(s0B"
);

sub begin_font {
local ($element, $tag) = @_;
print $font_begin{$element};
}

sub end_font {
local ($element, $tag) = @_;
print $font_end{$element};
}

1;
@EOF

chmod 664 h2a_pcl.pl

echo x - parse-html.pl
sed 's/^@//' >parse-html.pl <<'@EOF'
# HTML parser
# Jim Davis, July 15 1994

# This is an HTML parser not an SGML parser.  It does not parse a DTD,
# The DTD is implicit in the code, and specific to HTML.  
# The processing of the HTML can be customized by the user by
#   1) Defining routines to be called for various tags (see Begin and End arrays)
#   2) Defining routines html_content and html_whitespace

# This is not a validating parser.   It does not check the content model
# eg you can use DT outside a DL and it won't know.  It is too liberal in
# what tags are allowed to minimize what other tags.

# Bugs - can't parse the prolog or whatever you call it
#   <!DOCTYPE HTML [
#     <!entity % HTML.Minimal "INCLUDE">
#     <!-- Include standard HTML DTD -->
#     <!ENTITY % html PUBLIC "-//connolly hal.com//DTD WWW HTML 1.8//EN">
#     %html;
#     ]>

# modified 3 Aug to add a bunch of HTML 2.0 tags
# modified 3 Sept to print HTML stack to STDERR not STDOUT, to add new
#  routines html_begin_doc and html_end_doc for application specific cleanup
#  and to break parse_html into two pieces.
# modified 30 Sept 94.  parse_attributes now handles tag attributes that
#   don't have values.  thanks to  Bill Simpson-Young <bill@syd.dit.csiro.au>
#   for the code.
# modified 17 Apr 95 to support FORMS tags.
#
# modified 6 July 1995 by Ti Lian Hwang <tilh@sin-co.sin-ro.dhl.com>
#   to handle 'printer drivers' - files to be 'require' depending on $TERM
#   files are to have name of 'h2a_$TERM.pl'
#

$filename = "h2a_" . $ENV{"TERM"} . ".pl";
foreach $prefix (@INC) {                
    $realfilename = "$prefix/$filename";
    if (-f $realfilename) { 
	require "$filename" ;
	$filename = "";
	last;
    }
}
if ($filename) {require "dummy.pl" ; }

$debug = 0;

$whitespace_significant = 0;

# global variables: 
#  $line_buffer is line buffer
#  $line_count is input line number.

$line_buffer = "";
$line_count = 0;

sub parse_html {
    local ($file) = @_;
    open (HTML, $file) || die "Could not open $file: $!\nStopped";
    &parse_html_stream ();
    close (HTML);}

# Global input HTML is the handle to the stream of HTML
sub parse_html_stream {
    local ($token, $new);

    ## initialization
    @stack=();
    $line_count = 0;
    $line_buffer = "";

    ## application specific initialization
    &html_begin_doc();

  main:
    while (1) {

	# if whitespace does not matter, trim any leading space.
	if (! $whitespace_significant) {
	    $line_buffer =~ s/^\s+//;}

	# now dispatch on the type of token

	if ($line_buffer =~ /^(\s+)/) {
	    $token = $1;
	    $line_buffer = $';
	    &html_whitespace ($token);}

	# This will lose if there is more than one comment on the line!
	elsif ($line_buffer =~ /^(\<!--.*-->)/) {
	    $token = $1;
	    $line_buffer = $';
	    &html_comment ($token);}

	elsif ($line_buffer =~ /^(\<![^-][^\>]*\>)/) {
	    $token = $1;
	    $line_buffer = $';
	    &html_comment ($token);}

	elsif ($line_buffer =~ /^(\<\/[^\>]*\>)/) {
	    $token = $1;
	    $line_buffer = $';
	    &html_etag ($token);}

	elsif ($line_buffer =~ /^(\<[^!\/][^\>]*\>)/) {
	    $token = $1;
	    $line_buffer = $';
	    &html_tag ($token);}

	elsif ($line_buffer =~ /^([^\s<]+)/) {
	    $token = $1;
	    $line_buffer = $';
	    $token = &substitute_entities($token);
	    &html_content ($token); }

	else {
	    # No valid token in buffer.  Maybe it's empty, or maybe there's an
	    # incomplete tag.  So get some more data.
	    $new = <HTML>;
	    if (! defined ($new)) {last main;}
	    # if we're trying to find a match for a tag, then get rid of embedded newline
	    # this is, I think, a kludge
	    if ($line_buffer =~ /^\</ && $line_buffer =~ /\n$/) {
		chop $line_buffer;
		$line_buffer .= " ";}
	    $line_buffer .= $new;
	    $line_count++;}
    }

    ## cleanup
    &html_end_doc();

    if ($#stack > -1) {
	print STDERR "Stack not empty at end of document\n";
	&print_html_stack();}
}


sub html_tag {
    local ($tag) = @_;
    local ($element) = &tag_element ($tag);
    local (%attributes) = &tag_attributes ($tag);

    # the tag might minimize (be an implicit end) for the previous tag
    local ($prev_element);
    while (&Minimizes(&stack_top_element(), $element)) {
	$prev_element = &stack_pop_element ();
	if ($debug)  {
	    print STDERR "MINIMIZING $prev_element with $element on $line_count\n";}
	&html_end ($prev_element, 0);}

    push (@stack, $tag);

    &html_begin ($element, $tag, *attributes);

    if (&Empty($element)) {
	pop(@stack);
	&html_end ($element, 0);}
}

sub html_etag {
    local ($tag) = @_;
    local ($element) = &tag_element ($tag);

    # pop stack until find matching tag.  This is probably a bad idea,
    # or at least too general.
    local ( $prev_element) = &stack_pop_element();
    until ($prev_element eq $element) {
	if ($debug) {
	    print STDERR "MINIMIZING $prev_element with /$element on $line_count \n";}
	&html_end ($prev_element, 0);

	if ($#stack == -1) {
	    print STDERR "No match found for /$element.  You will lose\n";
	    last;}
	$prev_element = &stack_pop_element();}

    &html_end ($element, 1);
}


# For each element, the names of elements which minimize it.
# This is of course totally HTML dependent and probably I have it wrong too
$Minimize{"DT"} = "DT:DD";
$Minimize{"DD"} = "DT";
$Minimize{"LI"} = "LI";
$Minimize{"P"} = "P:DT:LI:H1:H2:H3:H4:BLOCKQUOTE:UL:OL:DL";

# Does element E2 minimize E1?
sub Minimizes {
    local ($e1, $e2) = @_;
    local ($value) = 0;
    foreach $elt (split (":", $Minimize{$e1})) {
	if ($elt eq $e2) {$value = 1;}}
    $value;}

$Empty{"BASE"} = 1;
$Empty{"BR"} = 1;
$Empty{"HR"} = 1;
$Empty{"IMG"} = 1;
$Empty{"ISINDEX"} = 1;
$Empty{"LINK"} = 1;
$Empty{"META"} = 1;
$Empty{"NEXTID"} = 1;
$Empty{"INPUT"} = 1;

# Empty tags have no content and hence no end tags
sub Empty {
    local ($element) = @_;
    $Empty{$element};}


sub print_html_stack {
    print STDERR "\n  ==\n";
    foreach $elt (reverse @stack) {print STDERR "  $elt\n";}
    print STDERR "  ==========\n";}

# The element on top of stack, if any.
sub stack_top_element {
    if ($#stack >= 0) {	&tag_element ($stack[$#stack]);}}

sub stack_pop_element {
    &tag_element (pop (@stack));}

# The element from the tag, normalized.
sub tag_element {
    local ($tag) = @_;
    $tag =~ /<\/?([^\s>]+)/;
    local ($element) = $1;
    $element =~ tr/a-z/A-Z/;
    $element;}

# associative array of the attributes of a tag.
sub tag_attributes {
    local ($tag) = @_;
    $tag =~ /^<[A-Za-z]+ +(.*)>$/;
    &parse_attributes($1);}

# string should be something like
# KEY="value" KEY2="longer value"  KEY3="tags o doom"
# output is an associative array (like a lisp property list)
# attributes names are not case sensitive, do I downcase them
# Maybe (probably) I should substitute for entities when parsing attributes.

sub parse_attributes {
    local ($string) = @_;
    local (%attributes);
    local ($name, $val);
  get: while (1) {
      if ($string =~ /^ *([A-Za-z]+)=\"([^\"]*)\"/) {
	  $name = $1;
	  $val = $2;
	  $string = $';
	  $name =~ tr/A-Z/a-z/;
	  $attributes{$name} = $val; }
      elsif ($string =~ /^ *([A-Za-z]+)=(\S*)/) {
	  $name = $1;
	  $val = $2;
	  $string = $';
	  $name =~ tr/A-Z/a-z/;
	  $attributes{$name} = $val;}
      elsif ($string =~ /^ *([A-Za-z]+)/) {
	  $name = $1;
	  $val = "";
	  $string = $';
	  $name =~ tr/A-Z/a-z/;
	  $attributes{$name} = $val;}
      else {last;}}
    %attributes;}

sub substitute_entities {
    local ($string) = @_;
    $string =~ s/&amp;/&/g;
    $string =~ s/&lt;/</g;
    $string =~ s/&gt;/>/g;
    $string =~ s/&quot;/\"/g;
    $string;}


@@HTML_elements = (
		  "A",
		  "ADDRESS",
		  "B",
		  "BASE",
		  "BLINK",	#  Netscape addition :-(
		  "BLOCKQUOTE",
		  "BODY",
		  "BR",
		  "CITE",
		  "CENTER",	# Netscape addition :-(
		  "CODE",
		  "DD",
		  "DIR",
		  "DFN",
		  "DL",
		  "DT",
		  "EM",
		  "FORM",
		  "H1", "H2", "H3", "H4", "H5", "H6",
		  "HEAD",
		  "HR",
		  "HTML",
		  "I",
		  "ISINDEX",
		  "IMG",
		  "INPUT",
		  "KBD",
		  "LI",
		  "LINK",
		  "MENU",
		  "META",
		  "NEXTID",
		  "OL",
		  "OPTION",
		  "P",
		  "PRE",
		  "SAMP",
		  "SELECT",
		  "STRIKE",
		  "STRONG",
		  "TITLE",
		  "TEXTAREA",
		  "TT",
		  "U",
		  "UL",
		  "VAR",
		  );

sub define_element {
    local ($element) = @_;
    $Begin{$element} = "Noop";
    $End{$element} = "Noop";}

foreach $element (@HTML_elements) {&define_element($element);}

# do nothing
sub Noop {
    local ($element, $xxx) = @_;}

# called when a tag begins.  Dispatches using Begin
sub html_begin {
    local ($element, $tag, *attributes) = @_;

    local ($routine) = $Begin{$element};
    if ($routine eq "") {
	print STDERR "Unknown HTML element $element ($tag) on line $line_count\n";}
    else {
	&begin_font ($element, $explicit);
	eval "&$routine;"}
}

# called when a tag ends.  Explicit is 0 if tag end is because of minimization
# not that you should care.
sub html_end {
    local ($element, $explicit) = @_;
    local ($routine) = $End{$element};
    if ($routine eq "") {
	print STDERR "Unknown HTML element \"$element\" (END $explicit) on line $line_count\n";}
    else {
	eval "&$routine(\"$element\", $explicit)";
	&end_font ($element, $explicit);
    }
}

sub html_content {
    local ($word) = @_;
}

sub html_whitespace {
    local ($whitespace) = @_;}

sub html_comment {
    local ($tag) = @_;}

# redefine these for application-specific initialization and cleanup

# sub html_begin_doc {}

# sub html_end_doc {}

# return a "true value" when loaded by perl.
1;

@EOF

chmod 644 parse-html.pl

echo x - html-ascii.pl
sed 's/^@//' >html-ascii.pl <<'@EOF'
# Routines for HTML to ASCII.
# (fixed width font, no font changes for size, bold, etc) with a little

# BUGS AND MISSING FEATURES
#  font tags (e.g. CODE, EM) cause an extra whitespace 
#   e.g. <TT>foo</TT>, -> foo ,

# Jim Davis July 15 1994
# modified 3 Aug 94 to support MENU and DIR

require "tformat.pl" || die "Could not load tformat.pl: $@\nStopped";

# Can be set by command line arg
if (! defined($columns_per_line)) {
    $columns_per_line = 72;}

if (! defined($flush_last_page)) {
    $flush_last_page = 1;}

# amount to add to indentation
$indent_left = 5;
$indent_right = 5;

# ignore contents inside HEAD.
$ignore_text = 0;

# Set variables in tformat
$left_margin = 1;
$right_margin = $columns_per_line;
$bottom_margin = 0;

## Routines called by html.pl
$Begin{"HEAD"} = "begin_head";
$End{"HEAD"} = "end_head";

sub begin_head {
    local ($element, $tag) = @_;
    $ignore_text = 1;}

sub end_head {
    local ($element) = @_;
    $ignore_text = 0;}

$Begin{"BODY"} = "begin_document";

sub begin_document {
    local ($element, $tag) = @_;
    &start_page();}

$End{"BODY"} = "end_document";

sub end_document {
    local ($element) = @_;
    &fresh_line();}

## Headers

$Begin{"H1"} = "begin_header";
$End{"H1"} = "end_header";

$Begin{"H2"} = "begin_header";
$End{"H2"} = "end_header";

$Begin{"H3"} = "begin_header";
$End{"H3"} = "end_header";

$Begin{"H4"} = "begin_header";
$End{"H4"} = "end_header";

$Skip_Before{"H1"} = 1;
$Skip_After{"H1"} = 1;

$Skip_Before{"H2"} = 1;
$Skip_After{"H2"} = 1;

$Skip_Before{"H3"} = 1;
$Skip_After{"H3"} = 0;

sub begin_header {
    local ($element, $tag) = @_;
    &skip_n_lines ($Skip_Before{$element}, 5);}

sub end_header {
    local ($element) = @_;
    &skip_n_lines ($Skip_After{$element});}

$Begin{"BR"} = "line_break";

sub line_break {
    local ($element, $tag) = @_;
    &fresh_line();}

$Begin{"P"} = "begin_paragraph";

# if fewer than this many lines left on page, start new page
$widow_cutoff = 5;

sub begin_paragraph {
    local ($element, $tag) = @_;
    &skip_n_lines (1, $widow_cutoff);}

$Begin{"BLOCKQUOTE"} = "begin_blockquote";
$End{"BLOCKQUOTE"} = "end_blockquote";

sub begin_blockquote {
    local ($element, $tag) = @_;
    $left_margin += $indent_left;
    $right_margin = $columns_per_line - $indent_right;
    &skip_n_lines (1);}

sub end_blockquote {
    local ($element) = @_;
    $left_margin -= $indent_left;
    $right_margin = $columns_per_line;
    &skip_n_lines (1);}

$Begin{"PRE"} = "begin_pre";
$End{"PRE"} = "end_pre";

sub begin_pre {
    local ($element, $tag) = @_;
    $whitespace_significant = 1;}

sub end_pre {
    local ($element) = @_;
    $whitespace_significant = 0;}

$Begin{"INPUT"} = "form_input";

sub form_input {
    local ($element, $tag, *attributes) = @_;
    if ($attributes{"value"} ne "") {
	&print_word_wrap($attributes{"value"});}}

$Begin{"HR"} = "horizontal_rule";

sub horizontal_rule {
    local ($element, $tag) = @_;
    &fresh_line ();
    &print_n_chars ($right_margin - $left_margin, "-");}

# Add code for IMG (use ALT attribute)
# Ignore I, B, EM, TT, CODE (no font changes)

## List environments

$Begin{"UL"} = "begin_itemize";
$End{"UL"} = "end_list_env";

$Begin{"OL"} = "begin_enumerated";
$End{"OL"} = "end_list_env";

$Begin{"MENU"} = "begin_menu";
$End{"MENU"} = "end_list_env";

$Begin{"DIR"} = "begin_dir";
$End{"DIR"} = "end_list_env";

$Begin{"LI"} = "begin_list_item";

@@list_stack = ();
$list_type = "bullet";
$list_counter = 0;

sub push_list_env {
    push (@list_stack, join (":", $list_type, $list_counter));}

sub pop_list_env {
    ($list_type, $list_counter) = split (":", pop (@list_stack));
    $left_margin -= $indent_left;}

sub begin_itemize {
    local ($element, $tag) = @_;
    &push_list_env();
    $left_margin += $indent_left;
    $list_type = "bullet";
    $list_counter = "*";}

sub begin_menu {
    local ($element, $tag) = @_;
    &push_list_env();
    $left_margin += $indent_left;
    $list_type = "bullet";
    $list_counter = "*";}

sub begin_dir {
    local ($element, $tag) = @_;
    &push_list_env();
    $left_margin += $indent_left;
    $list_type = "bullet";
    $list_counter = "*";}

sub begin_enumerated {
    local ($element, $tag) = @_;
    &push_list_env();
    $left_margin += $indent_left;
    $list_type = "enumerated";
    $list_counter = 1;}

sub end_list_env {
    local ($element) = @_;
    &pop_list_env();
#    &fresh_line();
}

sub begin_list_item {
    local ($element, $tag) = @_;
    $left_margin -= 2;
    &fresh_line();
    &print_word_wrap("$list_counter ");
    if ($list_type eq "enumerated") {$list_counter++;}
    $left_margin += 2;}

$Begin{"DL"} = "begin_dl";

sub begin_dl {
    local ($element, $tag) = @_;
    &skip_n_lines(1,5);}
    
$Begin{"DT"} = "begin_defined_term";
$Begin{"DD"} = "begin_defined_definition";
$End{"DD"} = "end_defined_definition";

sub begin_defined_term {
    local ($element, $tag) = @_;
    &fresh_line();}

sub begin_defined_definition {
    local ($element, $tag) = @_;
    $left_margin += $indent_left;
    &fresh_line();}

sub end_defined_definition {
    local ($element) = @_;
    $left_margin -= $indent_left;
    &fresh_line();}

$Begin{"META"} = "begin_meta";

# a META tag sets a value in the assoc array %Variable
# i.e. <META name="author" content="Rushdie"> sers $Variable{author} to "Rushdie"
sub begin_meta {
    local ($element, $tag, *attributes) = @_;
    local ($variable, $value);
    $variable = $attributes{name};
    $value = $attributes{content};
    $Variable{$variable} = $value;}

$Begin{"IMG"} = "begin_img";

sub begin_img {
    local ($element, $tag, *attributes) = @_;
    &print_word_wrap (($attributes{"alt"} ne "") ? $attributes{"alt"} : "[IMAGE]");}

# Content and whitespace.

sub html_content {
    local ($string) = @_;
    unless ($ignore_text) {
	&print_word_wrap ($string);}}

sub html_whitespace {
    local ($string) = @_;
    if (! $whitespace_significant) {
	die "Internal error, called html_whitespace when whitespace was not significant";}
    local ($i);
    for ($i = 0; $i < length ($string); $i++) {
	&print_whitespace (substr($string,$i,1));}}

# called by tformat.  Do nothing.
sub do_footer {
}

sub do_header {
}


1;
@EOF

chmod 644 html-ascii.pl

exit 0





file: /Techref/language/pcl/html2pcl.pl.htm, 18KB, , updated: 2001/3/30 08:53, local time: 2024/11/22 08:35,
TOP NEW HELP FIND: 
13.58.161.115:LOG IN

 ©2024 These pages are served without commercial sponsorship. (No popup ads, etc...).Bandwidth abuse increases hosting cost forcing sponsorship or shutdown. This server aggressively defends against automated copying for any reason including offline viewing, duplication, etc... Please respect this requirement and DO NOT RIP THIS SITE. Questions?
Please DO link to this page! Digg it! / MAKE!

<A HREF="http://techref.massmind.org/Techref/language/pcl/html2pcl.pl.htm"> language pcl html2pcl</A>

After you find an appropriate page, you are invited to your to this massmind site! (posts will be visible only to you before review) Just type a nice message (short messages are blocked as spam) in the box and press the Post button. (HTML welcomed, but not the <A tag: Instead, use the link box to link to another page. A tutorial is available Members can login to post directly, become page editors, and be credited for their posts.


Link? Put it here: 
if you want a response, please enter your email address: 
Attn spammers: All posts are reviewed before being made visible to anyone other than the poster.
Did you find what you needed?

 

Welcome to massmind.org!

 

Welcome to techref.massmind.org!

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

  .