1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
| from __future__ import annotations
import os
from io import BytesIO
import logging
logging.basicConfig(level=logging.DEBUG)
# This script hardcodes and generates a small PDF containing a variety of
# page-content operators (graphics, text, state) so you can step through
# pdfminer.six extract_pages() and observe how each operator is handled.
#
# Output: samples/debug_ops.pdf
#
# Key points to help debugging pdfminer internals:
# - The font is a Type0 + CIDFont with /Encoding /GB-EUC-H to trigger
# CMapDB.get_cmap('GB-EUC-H') and thus CMapDB._load_data when not cached.
# - The content stream includes operators: q/Q, cm, w, RG, rg, m, l, h, S,
# re, f, BT/ET, Tf, Td (moves the text position; no explicit Tm is emitted),
# Tj, TJ.
# - Chinese text bytes <D6D0B9FA> represent "中国" in GB2312, so with
# /Encoding /GB-EUC-H pdfminer will consult the CMap.
def build_pdf_bytes() -> bytes:
    """Build a small single-page PDF that exercises common content operators.

    The file contains six indirect objects: catalog (1), page tree (2),
    page (3), Type0 font (4), content stream (5) and the CIDFont
    descendant (6).  The Type0 font names the predefined CMap ``GB-EUC-H``
    as its encoding and the descendant declares the Adobe-GB1 character
    collection, which matches the GB2312 bytes shown in the content stream.

    Returns:
        The complete PDF (header, objects, xref table, trailer) as bytes.
    """
    buf = BytesIO()

    def emit(chunk: bytes) -> int:
        """Append *chunk* to the buffer and return the offset it starts at."""
        offset = buf.tell()
        buf.write(chunk)
        return offset

    # Page content: graphics state, path painting and text operators.
    # GB2312 for "中国" is D6 D0 B9 FA.
    content_ops = b"\n".join(
        [
            b"q",  # save graphics state
            b"1 0 0 1 0 0 cm",  # identity CTM (explicit)
            b"0.75 w",  # line width
            b"0 0 1 RG",  # stroke color = blue
            b"1 0 0 rg",  # fill color = red
            b"100 600 m",  # move to
            b"200 650 l",  # line to
            b"300 600 l",  # line to
            b"h",  # close path
            b"S",  # stroke
            b"100 500 150 40 re",  # rectangle
            b"f",  # fill
            b"BT",  # begin text
            b"/F1 24 Tf",  # font + size
            b"100 700 Td",  # move text position
            b"<48656C6C6F20504446> Tj",  # "Hello PDF"
            b"0 -30 Td",  # next line down
            b"<D6D0B9FA> Tj",  # "中国" in GB2312; mapped via GB-EUC-H
            b"-50 -40 Td",  # move
            # Kerning array: literal strings, adjustments and a hex-encoded
            # space.  The hex string must not be wrapped in parentheses, or
            # it becomes the four literal characters "<20>".
            b"[(AB) -50 <20> 100 (CD)] TJ",
            b"ET",  # end text
            b"Q",  # restore graphics state
            b"",  # forces a trailing newline after the last operator
        ]
    )

    # (object number, body) pairs in the order they are written to the file.
    objects: list[tuple[int, bytes]] = [
        # 1. Catalog
        (1, b"<< /Type /Catalog /Pages 2 0 R >>"),
        # 2. Pages
        (2, b"<< /Type /Pages /Kids [3 0 R] /Count 1 >>"),
        # 4. Type0 font.  The encoding is the predefined CMap name documented
        #    for this sample; it must stay a plain CMap name and never carry
        #    "#2F"-escaped path components, which would point a CMap loader
        #    at files outside its data directory.
        (
            4,
            b"<< /Type /Font /Subtype /Type0 "
            b"/BaseFont /DebugCID "
            b"/Encoding /GB-EUC-H "
            b"/DescendantFonts [6 0 R] >>",
        ),
        # 6. CIDFont descendant declaring the Adobe-GB1 character collection
        (
            6,
            b"<< /Type /Font /Subtype /CIDFontType0 "
            b"/BaseFont /DebugCID "
            b"/CIDSystemInfo << /Registry (Adobe) /Ordering (GB1) /Supplement 5 >> "
            b"/DW 1000 "
            b"/FontDescriptor << "
            b"/Type /FontDescriptor "
            b"/FontName /DebugCID "
            b"/FontBBox [0 -200 1000 900] "  # must be exactly four numbers
            b"/Ascent 800 "
            b"/Descent -200 "
            b"/CapHeight 700 "
            b"/Flags 32 "
            b"/ItalicAngle 0 "
            b"/StemV 80 "
            b">>"
            b">>",
        ),
        # 5. Content stream.  /Length counts only the bytes between the
        #    "stream\n" keyword and the newline preceding "endstream".
        (
            5,
            b"<< /Length %d >>\n" % len(content_ops)
            + b"stream\n"
            + content_ops
            + b"\nendstream",
        ),
        # 3. Page (references the content stream and the font resource)
        (
            3,
            b"<< /Type /Page /Parent 2 0 R "
            b"/MediaBox [0 0 612 792] "
            b"/Resources << /Font << /F1 4 0 R >> >> "
            b"/Contents 5 0 R >>",
        ),
    ]

    # Header plus a binary comment line.
    emit(b"%PDF-1.4\n%\xE2\xE3\xCF\xD3\n")

    # Write every object and remember where its "N 0 obj" line begins.
    offsets: dict[int, int] = {}
    for obj_id, obj_body in objects:
        offsets[obj_id] = emit(b"%d 0 obj\n" % obj_id)
        emit(obj_body)
        emit(b"\nendobj\n")

    # Cross-reference table: entry 0 is the mandatory free entry, followed by
    # one fixed-width 20-byte entry per object number.
    startxref = buf.tell()
    size = max(offsets, default=0) + 1
    emit(b"xref\n")
    emit(b"0 %d\n" % size)
    emit(b"0000000000 65535 f \n")
    for num in range(1, size):
        if num in offsets:
            emit(b"%010d 00000 n \n" % offsets[num])
        else:
            # A gap in the numbering is a free object, not an in-use object
            # located at offset 0.
            emit(b"0000000000 65535 f \n")

    # Trailer and pointer back to the xref table.
    emit(b"trailer\n<< /Size %d /Root 1 0 R >>\n" % size)
    emit(b"startxref\n")
    emit(b"%d\n" % startxref)
    emit(b"%%EOF\n")
    return buf.getvalue()
def main() -> str:
    """Write the generated PDF to ``<script dir>/../samples/debug_ops.pdf``.

    The samples directory is created if it does not exist.

    Returns:
        The absolute path of the file that was written.
    """
    script_dir = os.path.dirname(__file__)
    samples_dir = os.path.abspath(os.path.join(script_dir, os.pardir, "samples"))
    os.makedirs(samples_dir, exist_ok=True)

    target = os.path.join(samples_dir, "debug_ops.pdf")
    pdf_bytes = build_pdf_bytes()
    with open(target, "wb") as handle:
        handle.write(pdf_bytes)

    print("PDF written:", target)
    print("Size:", len(pdf_bytes), "bytes")
    return target


# Generate the sample only when run as a script, not when imported.
if __name__ == "__main__":
    main()
|