Ruby 4.0.5p0 (2026-05-20 revision 64336ffd0ee9e1f4c05891695a3d7b49cb709721)
string.c
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "id.h"
27#include "internal.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/concurrent_set.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/hash.h"
36#include "internal/numeric.h"
37#include "internal/object.h"
38#include "internal/proc.h"
39#include "internal/re.h"
40#include "internal/sanitizers.h"
41#include "internal/string.h"
42#include "internal/transcode.h"
43#include "probes.h"
44#include "ruby/encoding.h"
45#include "ruby/re.h"
46#include "ruby/thread.h"
47#include "ruby/util.h"
48#include "ruby/ractor.h"
49#include "ruby_assert.h"
50#include "shape.h"
51#include "vm_sync.h"
53
54#if defined HAVE_CRYPT_R
55# if defined HAVE_CRYPT_H
56# include <crypt.h>
57# endif
58#elif !defined HAVE_CRYPT
59# include "missing/crypt.h"
60# define HAVE_CRYPT_R 1
61#endif
62
/* Shorthand for entry `no` of the `beg` / `end` arrays reached through a
 * pointer named `regs`, which must be in scope at the expansion site. */
#define BEG(no) (regs->beg[(no)])
#define END(no) (regs->end[(no)])
65
66#undef rb_str_new
67#undef rb_usascii_str_new
68#undef rb_utf8_str_new
69#undef rb_enc_str_new
70#undef rb_str_new_cstr
71#undef rb_usascii_str_new_cstr
72#undef rb_utf8_str_new_cstr
73#undef rb_enc_str_new_cstr
74#undef rb_external_str_new_cstr
75#undef rb_locale_str_new_cstr
76#undef rb_str_dup_frozen
77#undef rb_str_buf_new_cstr
78#undef rb_str_buf_cat
79#undef rb_str_buf_cat2
80#undef rb_str_cat2
81#undef rb_str_cat_cstr
82#undef rb_fstring_cstr
83
86
87/* Flags of RString
88 *
89 * 0: STR_SHARED (equal to ELTS_SHARED)
90 * The string is shared. The buffer this string points to is owned by
91 * another string (the shared root).
92 * 1: RSTRING_NOEMBED
93 * The string is not embedded. When a string is embedded, the contents
 *      follow the header. When a string is not embedded, the contents are
 *      in a separately allocated buffer.
96 * 2: STR_CHILLED_LITERAL (will be frozen in a future version)
97 * The string was allocated as a literal in a file without an explicit `frozen_string_literal` comment.
98 * It emits a deprecation warning when mutated for the first time.
99 * 3: STR_CHILLED_SYMBOL_TO_S (will be frozen in a future version)
100 * The string was allocated by the `Symbol#to_s` method.
101 * It emits a deprecation warning when mutated for the first time.
102 * 4: STR_PRECOMPUTED_HASH
103 * The string is embedded and has its precomputed hashcode stored
104 * after the terminator.
105 * 5: STR_SHARED_ROOT
106 * Other strings may point to the contents of this string. When this
107 * flag is set, STR_SHARED must not be set.
108 * 6: STR_BORROWED
109 * When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
110 * to be unshared by rb_str_tmp_frozen_release.
111 * 7: STR_TMPLOCK
112 * The pointer to the buffer is passed to a system call such as
113 * read(2). Any modification and realloc is prohibited.
114 * 8-9: ENC_CODERANGE
115 * Stores the coderange of the string.
116 * 10-16: ENCODING
117 * Stores the encoding of the string.
118 * 17: RSTRING_FSTR
119 * The string is a fstring. The string is deduplicated in the fstring
120 * table.
121 * 18: STR_NOFREE
122 * Do not free this string's buffer when the string is reclaimed
123 * by the garbage collector. Used for when the string buffer is a C
124 * string literal.
125 * 19: STR_FAKESTR
126 * The string is not allocated or managed by the garbage collector.
127 * Typically, the string object header (struct RString) is temporarily
128 * allocated on C stack.
129 */
130
/* NOTE(review): presumably the largest byte length of a single character
 * in any supported encoding -- confirm against the users of this macro. */
#define RUBY_MAX_CHAR_LEN 16
/* String-private flag bits; their meaning is described in the
 * "Flags of RString" table. */
#define STR_PRECOMPUTED_HASH FL_USER4
#define STR_SHARED_ROOT FL_USER5
#define STR_BORROWED FL_USER6
#define STR_TMPLOCK FL_USER7
#define STR_NOFREE FL_USER18
#define STR_FAKESTR FL_USER19
138
/* Mark `str` as heap-allocated: sets STR_NOEMBED and clears the sharing
 * related flags (STR_SHARED, STR_SHARED_ROOT, STR_BORROWED). */
#define STR_SET_NOEMBED(str) do {\
    FL_SET((str), STR_NOEMBED);\
    FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
} while (0)
/* Mark `str` as embedded: clears STR_NOEMBED, STR_SHARED and STR_NOFREE. */
#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
144
/* Store `n` into the length field of `str`.  Only the field is written;
 * nothing else about the string is touched by this macro. */
#define STR_SET_LEN(str, n) do { \
    RSTRING(str)->len = (n); \
} while (0)
148
149static inline bool
150str_encindex_fastpath(int encindex)
151{
152 // The overwhelming majority of strings are in one of these 3 encodings.
153 switch (encindex) {
154 case ENCINDEX_ASCII_8BIT:
155 case ENCINDEX_UTF_8:
156 case ENCINDEX_US_ASCII:
157 return true;
158 default:
159 return false;
160 }
161}
162
163static inline bool
164str_enc_fastpath(VALUE str)
165{
166 return str_encindex_fastpath(ENCODING_GET_INLINED(str));
167}
168
/* Terminator length of `str` in bytes: 1 for the fast-path encodings,
 * otherwise the minimum character length of the string's encoding. */
#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
/* Write `termlen` zero bytes at `ptr`.  The single-byte case is handled
 * with a plain store; memset is used only for wider terminators.
 * Both arguments are evaluated exactly once. */
#define TERM_FILL(ptr, termlen) do {\
    char *const term_fill_ptr = (ptr);\
    const int term_fill_len = (termlen);\
    *term_fill_ptr = '\0';\
    if (UNLIKELY(term_fill_len > 1))\
        memset(term_fill_ptr, 0, term_fill_len);\
} while (0)
177
/* Resize the buffer of `str` to `capacity` bytes plus the terminator
 * length derived from the string's own encoding. */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)
/* Resize the buffer of `str` to `capacity` bytes plus `termlen` bytes.
 *
 * - Embedded string: if the slot is too small, the contents are copied
 *   to a freshly allocated heap buffer and the string becomes NOEMBED;
 *   otherwise nothing happens.
 * - Heap string: the buffer is reallocated in place.  The string must not
 *   be shared (asserted).
 *
 * The macro arguments are parenthesized everywhere they are used so that
 * expression arguments cannot change the meaning of the size comparison. */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
201
/* Make `str` share the buffer of `shared_str`.
 *
 * Does nothing for fake strings.  Otherwise it asserts that the pointer of
 * `str` lies within the buffer of `shared_str`, records `shared_str` in
 * `str` through the write barrier, sets STR_SHARED on `str` and
 * STR_SHARED_ROOT on `shared_str`, and additionally marks a class-less
 * `shared_str` as STR_BORROWED. */
#define STR_SET_SHARED(str, shared_str) do { \
    if (!FL_TEST(str, STR_FAKESTR)) { \
        RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
        RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
        RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
        FL_SET((str), STR_SHARED); \
        FL_SET((shared_str), STR_SHARED_ROOT); \
        if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
            FL_SET_RAW((shared_str), STR_BORROWED); \
    } \
} while (0)
213
/* Heap buffer pointer of a non-embedded string. */
#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/* Byte size of the heap buffer: the recorded capacity plus the terminator. */
#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
/* TODO: include the terminator size in capa. */

/* Encoding object of `str`; thin alias for get_encoding(). */
#define STR_ENC_GET(str) get_encoding(str)
219
/* SHARABLE_MIDDLE_SUBSTRING defaults to 0.  With the default, a substring
 * is considered sharable only when it ends exactly at `end`
 * (beg + len == end); when the option is enabled every substring is. */
#if !defined SHARABLE_MIDDLE_SUBSTRING
# define SHARABLE_MIDDLE_SUBSTRING 0
#endif
#if !SHARABLE_MIDDLE_SUBSTRING
#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
#else
#define SHARABLE_SUBSTRING_P(beg, len, end) 1
#endif
228
229
230static inline long
231str_embed_capa(VALUE str)
232{
233 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
234}
235
236bool
237rb_str_reembeddable_p(VALUE str)
238{
239 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
240}
241
242static inline size_t
243rb_str_embed_size(long capa, long termlen)
244{
245 size_t size = offsetof(struct RString, as.embed.ary) + capa + termlen;
246 if (size < sizeof(struct RString)) size = sizeof(struct RString);
247 return size;
248}
249
250size_t
251rb_str_size_as_embedded(VALUE str)
252{
253 size_t real_size;
254 if (STR_EMBED_P(str)) {
255 size_t capa = RSTRING(str)->len;
256 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) capa += sizeof(st_index_t);
257
258 real_size = rb_str_embed_size(capa, TERM_LEN(str));
259 }
260 /* if the string is not currently embedded, but it can be embedded, how
261 * much space would it require */
262 else if (rb_str_reembeddable_p(str)) {
263 size_t capa = RSTRING(str)->as.heap.aux.capa;
264 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) capa += sizeof(st_index_t);
265
266 real_size = rb_str_embed_size(capa, TERM_LEN(str));
267 }
268 else {
269 real_size = sizeof(struct RString);
270 }
271
272 return real_size;
273}
274
/*
 * Asks the GC whether an object large enough to embed `len` bytes plus a
 * `termlen` byte terminator can be allocated.
 */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t object_size = rb_str_embed_size(len, termlen);
    return rb_gc_size_allocatable_p(object_size);
}
280
281static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
282static VALUE str_new_frozen(VALUE klass, VALUE orig);
283static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
284static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
285static VALUE str_new(VALUE klass, const char *ptr, long len);
286static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
287static inline void str_modifiable(VALUE str);
288static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
289static inline VALUE str_alloc_embed(VALUE klass, size_t capa);
290
291static inline void
292str_make_independent(VALUE str)
293{
294 long len = RSTRING_LEN(str);
295 int termlen = TERM_LEN(str);
296 str_make_independent_expand((str), len, 0L, termlen);
297}
298
299static inline int str_dependent_p(VALUE str);
300
301void
302rb_str_make_independent(VALUE str)
303{
304 if (str_dependent_p(str)) {
305 str_make_independent(str);
306 }
307}
308
309void
310rb_str_make_embedded(VALUE str)
311{
312 RUBY_ASSERT(rb_str_reembeddable_p(str));
313 RUBY_ASSERT(!STR_EMBED_P(str));
314
315 char *buf = RSTRING(str)->as.heap.ptr;
316 long len = RSTRING(str)->len;
317
318 STR_SET_EMBED(str);
319 STR_SET_LEN(str, len);
320
321 if (len > 0) {
322 memcpy(RSTRING_PTR(str), buf, len);
323 ruby_xfree(buf);
324 }
325
326 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
327}
328
/*
 * Print a diagnostic to stderr saying that `func` is about to return a
 * NULL string pointer.  Only reports; it does not abort.
 */
void
rb_debug_rstring_null_ptr(const char *func)
{
    FILE *const out = stderr;

    fprintf(out,
            "%s is returning NULL!! "
            "SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, "
            "and look at the passed string.\n",
            func);
}
338
339/* symbols for [up|down|swap]case/capitalize options */
340static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
341
342static rb_encoding *
343get_encoding(VALUE str)
344{
345 return rb_enc_from_index(ENCODING_GET(str));
346}
347
348static void
349mustnot_broken(VALUE str)
350{
351 if (is_broken_string(str)) {
352 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
353 }
354}
355
356static void
357mustnot_wchar(VALUE str)
358{
359 rb_encoding *enc = STR_ENC_GET(str);
360 if (rb_enc_mbminlen(enc) > 1) {
361 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
362 }
363}
364
365static VALUE register_fstring(VALUE str, bool copy, bool force_precompute_hash);
366
/* Defined only when `long` and pointers have the same size.  The code that
 * tests this macro stashes a string hash in the `long` capa field of fake
 * strings. */
#if SIZEOF_LONG == SIZEOF_VOIDP
#define PRECOMPUTED_FAKESTR_HASH 1
#else
#endif
371
372static inline bool
373BARE_STRING_P(VALUE str)
374{
375 return RBASIC_CLASS(str) == rb_cString && !rb_shape_obj_has_ivars(str);
376}
377
378static inline st_index_t
379str_do_hash(VALUE str)
380{
381 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
382 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
383 if (e && !is_ascii_string(str)) {
384 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
385 }
386 return h;
387}
388
389static VALUE
390str_store_precomputed_hash(VALUE str, st_index_t hash)
391{
392 RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH));
393 RUBY_ASSERT(STR_EMBED_P(str));
394
395#if RUBY_DEBUG
396 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
397 size_t free_bytes = str_embed_capa(str) - used_bytes;
398 RUBY_ASSERT(free_bytes >= sizeof(st_index_t));
399#endif
400
401 memcpy(RSTRING_END(str) + TERM_LEN(str), &hash, sizeof(hash));
402
403 FL_SET(str, STR_PRECOMPUTED_HASH);
404
405 return str;
406}
407
408VALUE
409rb_fstring(VALUE str)
410{
411 VALUE fstr;
412 int bare;
413
414 Check_Type(str, T_STRING);
415
416 if (FL_TEST(str, RSTRING_FSTR))
417 return str;
418
419 bare = BARE_STRING_P(str);
420 if (!bare) {
421 if (STR_EMBED_P(str)) {
422 OBJ_FREEZE(str);
423 return str;
424 }
425
426 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
428 return str;
429 }
430 }
431
432 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
433 rb_str_resize(str, RSTRING_LEN(str));
434
435 fstr = register_fstring(str, false, false);
436
437 if (!bare) {
438 str_replace_shared_without_enc(str, fstr);
439 OBJ_FREEZE(str);
440 return str;
441 }
442 return fstr;
443}
444
445static VALUE fstring_table_obj;
446
447static VALUE
448fstring_concurrent_set_hash(VALUE str)
449{
450#ifdef PRECOMPUTED_FAKESTR_HASH
451 st_index_t h;
452 if (FL_TEST_RAW(str, STR_FAKESTR)) {
453 // register_fstring precomputes the hash and stores it in capa for fake strings
454 h = (st_index_t)RSTRING(str)->as.heap.aux.capa;
455 }
456 else {
457 h = rb_str_hash(str);
458 }
459 // rb_str_hash doesn't include the encoding for ascii only strings, so
460 // we add it to avoid common collisions between `:sym.name` (ASCII) and `"sym"` (UTF-8)
461 return (VALUE)rb_hash_end(rb_hash_uint32(h, (uint32_t)ENCODING_GET_INLINED(str)));
462#else
463 return (VALUE)rb_str_hash(str);
464#endif
465}
466
467static bool
468fstring_concurrent_set_cmp(VALUE a, VALUE b)
469{
470 long alen, blen;
471 const char *aptr, *bptr;
472
475
476 RSTRING_GETMEM(a, aptr, alen);
477 RSTRING_GETMEM(b, bptr, blen);
478 return (alen == blen &&
479 ENCODING_GET(a) == ENCODING_GET(b) &&
480 memcmp(aptr, bptr, alen) == 0);
481}
482
/* Options handed from register_fstring() to the table's create callback. */
struct fstr_create_arg {
    /* For fake strings: copy the bytes into a new string instead of
     * creating a string that points at the caller's buffer. */
    bool copy;
    /* When copying: prefer an embedded string with room for, and holding,
     * its precomputed hash. */
    bool force_precompute_hash;
};
487
488static VALUE
489fstring_concurrent_set_create(VALUE str, void *data)
490{
491 struct fstr_create_arg *arg = data;
492
493 // Unless the string is empty or binary, its coderange has been precomputed.
494 int coderange = ENC_CODERANGE(str);
495
496 if (FL_TEST_RAW(str, STR_FAKESTR)) {
497 if (arg->copy) {
498 VALUE new_str;
499 long len = RSTRING_LEN(str);
500 long capa = len + sizeof(st_index_t);
501 int term_len = TERM_LEN(str);
502
503 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) {
504 new_str = str_alloc_embed(rb_cString, capa + term_len);
505 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len);
506 STR_SET_LEN(new_str, RSTRING_LEN(str));
507 TERM_FILL(RSTRING_END(new_str), TERM_LEN(str));
508 rb_enc_copy(new_str, str);
509 str_store_precomputed_hash(new_str, str_do_hash(str));
510 }
511 else {
512 new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
513 rb_enc_copy(new_str, str);
514#ifdef PRECOMPUTED_FAKESTR_HASH
515 if (rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len + sizeof(st_index_t)) {
516 str_store_precomputed_hash(new_str, (st_index_t)RSTRING(str)->as.heap.aux.capa);
517 }
518#endif
519 }
520 str = new_str;
521 }
522 else {
523 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
524 RSTRING(str)->len,
525 ENCODING_GET(str));
526 }
527 OBJ_FREEZE(str);
528 }
529 else {
530 if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
531 str = str_new_frozen(rb_cString, str);
532 }
533 if (STR_SHARED_P(str)) { /* str should not be shared */
534 /* shared substring */
535 str_make_independent(str);
537 }
538 if (!BARE_STRING_P(str)) {
539 str = str_new_frozen(rb_cString, str);
540 }
541 }
542
543 ENC_CODERANGE_SET(str, coderange);
544 RBASIC(str)->flags |= RSTRING_FSTR;
545 if (!RB_OBJ_SHAREABLE_P(str)) {
546 RB_OBJ_SET_SHAREABLE(str);
547 }
548 RUBY_ASSERT((rb_gc_verify_shareable(str), 1));
551 RUBY_ASSERT(!FL_TEST_RAW(str, STR_FAKESTR));
552 RUBY_ASSERT(!rb_shape_obj_has_ivars(str));
554 RUBY_ASSERT(!rb_objspace_garbage_object_p(str));
555
556 return str;
557}
558
/* Callback table handed to rb_concurrent_set_new() for the fstring table.
 * No `free` callback is installed. */
static const struct rb_concurrent_set_funcs fstring_concurrent_set_funcs = {
    .hash = fstring_concurrent_set_hash,
    .cmp = fstring_concurrent_set_cmp,
    .create = fstring_concurrent_set_create,
    .free = NULL,
};
565
566void
567Init_fstring_table(void)
568{
569 fstring_table_obj = rb_concurrent_set_new(&fstring_concurrent_set_funcs, 8192);
570 rb_gc_register_address(&fstring_table_obj);
571}
572
573static VALUE
574register_fstring(VALUE str, bool copy, bool force_precompute_hash)
575{
576 struct fstr_create_arg args = {
577 .copy = copy,
578 .force_precompute_hash = force_precompute_hash
579 };
580
581#if SIZEOF_VOIDP == SIZEOF_LONG
582 if (FL_TEST_RAW(str, STR_FAKESTR)) {
583 // if the string hasn't been interned, we'll need the hash twice, so we
584 // compute it once and store it in capa
585 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
586 }
587#endif
588
589 VALUE result = rb_concurrent_set_find_or_insert(&fstring_table_obj, str, &args);
590
591 RUBY_ASSERT(!rb_objspace_garbage_object_p(result));
593 RUBY_ASSERT(OBJ_FROZEN(result));
595 RUBY_ASSERT((rb_gc_verify_shareable(result), 1));
596 RUBY_ASSERT(!FL_TEST_RAW(result, STR_FAKESTR));
598
599 return result;
600}
601
602bool
603rb_obj_is_fstring_table(VALUE obj)
604{
605 ASSERT_vm_locking();
606
607 return obj == fstring_table_obj;
608}
609
/*
 * Remove the fstring `obj` from the fstring table and clear its
 * RSTRING_FSTR flag.
 *
 * `obj` must be flagged RSTRING_FSTR and must not be STR_SHARED (both
 * asserted).  The entry is removed by identity, not by content.
 */
void
rb_gc_free_fstring(VALUE obj)
{
    ASSERT_vm_locking_with_barrier();

    RUBY_ASSERT(FL_TEST(obj, RSTRING_FSTR));
    RUBY_ASSERT(!FL_TEST(obj, STR_SHARED));

    rb_concurrent_set_delete_by_identity(fstring_table_obj, obj);

    RB_DEBUG_COUNTER_INC(obj_str_fstr);

    /* Clear the flag only after the table no longer references the object. */
    FL_UNSET(obj, RSTRING_FSTR);
}
625
626void
627rb_fstring_foreach_with_replace(int (*callback)(VALUE *str, void *data), void *data)
628{
629 if (fstring_table_obj) {
630 rb_concurrent_set_foreach_with_replace(fstring_table_obj, callback, data);
631 }
632}
633
634static VALUE
635setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
636{
637 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
638 RBASIC_SET_SHAPE_ID((VALUE)fake_str, ROOT_SHAPE_ID);
639
640 if (!name) {
642 name = "";
643 }
644
645 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
646
647 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
648 fake_str->len = len;
649 fake_str->as.heap.ptr = (char *)name;
650 fake_str->as.heap.aux.capa = len;
651 return (VALUE)fake_str;
652}
653
654/*
655 * set up a fake string which refers a static string literal.
656 */
657VALUE
658rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
659{
660 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
661}
662
663/*
664 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
665 * shared string which refers a static string literal. `ptr` must
666 * point a constant string.
667 */
668VALUE
669rb_fstring_new(const char *ptr, long len)
670{
671 struct RString fake_str = {RBASIC_INIT};
672 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false);
673}
674
675VALUE
676rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
677{
678 struct RString fake_str = {RBASIC_INIT};
679 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false);
680}
681
682VALUE
683rb_fstring_cstr(const char *ptr)
684{
685 return rb_fstring_new(ptr, strlen(ptr));
686}
687
688static inline bool
689single_byte_optimizable(VALUE str)
690{
691 int encindex = ENCODING_GET(str);
692 switch (encindex) {
693 case ENCINDEX_ASCII_8BIT:
694 case ENCINDEX_US_ASCII:
695 return true;
696 case ENCINDEX_UTF_8:
697 // For UTF-8 it's worth scanning the string coderange when unknown.
699 }
700 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
701 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
702 return true;
703 }
704
705 if (rb_enc_mbmaxlen(rb_enc_from_index(encindex)) == 1) {
706 return true;
707 }
708
709 /* Conservative. Possibly single byte.
710 * "\xa1" in Shift_JIS for example. */
711 return false;
712}
713
715
716static inline const char *
717search_nonascii(const char *p, const char *e)
718{
719 const char *s, *t;
720
721#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
722# if SIZEOF_UINTPTR_T == 8
723# define NONASCII_MASK UINT64_C(0x8080808080808080)
724# elif SIZEOF_UINTPTR_T == 4
725# define NONASCII_MASK UINT32_C(0x80808080)
726# else
727# error "don't know what to do."
728# endif
729#else
730# if SIZEOF_UINTPTR_T == 8
731# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
732# elif SIZEOF_UINTPTR_T == 4
733# define NONASCII_MASK 0x80808080UL /* or...? */
734# else
735# error "don't know what to do."
736# endif
737#endif
738
739 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
740#if !UNALIGNED_WORD_ACCESS
741 if ((uintptr_t)p % SIZEOF_VOIDP) {
742 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
743 p += l;
744 switch (l) {
745 default: UNREACHABLE;
746#if SIZEOF_VOIDP > 4
747 case 7: if (p[-7]&0x80) return p-7;
748 case 6: if (p[-6]&0x80) return p-6;
749 case 5: if (p[-5]&0x80) return p-5;
750 case 4: if (p[-4]&0x80) return p-4;
751#endif
752 case 3: if (p[-3]&0x80) return p-3;
753 case 2: if (p[-2]&0x80) return p-2;
754 case 1: if (p[-1]&0x80) return p-1;
755 case 0: break;
756 }
757 }
758#endif
759#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
760#define aligned_ptr(value) \
761 __builtin_assume_aligned((value), sizeof(uintptr_t))
762#else
763#define aligned_ptr(value) (value)
764#endif
765 s = aligned_ptr(p);
766 t = (e - (SIZEOF_VOIDP-1));
767#undef aligned_ptr
768 for (;s < t; s += sizeof(uintptr_t)) {
769 uintptr_t word;
770 memcpy(&word, s, sizeof(word));
771 if (word & NONASCII_MASK) {
772#ifdef WORDS_BIGENDIAN
773 return (const char *)s + (nlz_intptr(word&NONASCII_MASK)>>3);
774#else
775 return (const char *)s + (ntz_intptr(word&NONASCII_MASK)>>3);
776#endif
777 }
778 }
779 p = (const char *)s;
780 }
781
782 switch (e - p) {
783 default: UNREACHABLE;
784#if SIZEOF_VOIDP > 4
785 case 7: if (e[-7]&0x80) return e-7;
786 case 6: if (e[-6]&0x80) return e-6;
787 case 5: if (e[-5]&0x80) return e-5;
788 case 4: if (e[-4]&0x80) return e-4;
789#endif
790 case 3: if (e[-3]&0x80) return e-3;
791 case 2: if (e[-2]&0x80) return e-2;
792 case 1: if (e[-1]&0x80) return e-1;
793 case 0: return NULL;
794 }
795}
796
797static int
798coderange_scan(const char *p, long len, rb_encoding *enc)
799{
800 const char *e = p + len;
801
802 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
803 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
804 p = search_nonascii(p, e);
806 }
807
808 if (rb_enc_asciicompat(enc)) {
809 p = search_nonascii(p, e);
810 if (!p) return ENC_CODERANGE_7BIT;
811 for (;;) {
812 int ret = rb_enc_precise_mbclen(p, e, enc);
814 p += MBCLEN_CHARFOUND_LEN(ret);
815 if (p == e) break;
816 p = search_nonascii(p, e);
817 if (!p) break;
818 }
819 }
820 else {
821 while (p < e) {
822 int ret = rb_enc_precise_mbclen(p, e, enc);
824 p += MBCLEN_CHARFOUND_LEN(ret);
825 }
826 }
827 return ENC_CODERANGE_VALID;
828}
829
830long
831rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
832{
833 const char *p = s;
834
835 if (*cr == ENC_CODERANGE_BROKEN)
836 return e - s;
837
838 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
839 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
840 if (*cr == ENC_CODERANGE_VALID) return e - s;
841 p = search_nonascii(p, e);
843 return e - s;
844 }
845 else if (rb_enc_asciicompat(enc)) {
846 p = search_nonascii(p, e);
847 if (!p) {
848 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
849 return e - s;
850 }
851 for (;;) {
852 int ret = rb_enc_precise_mbclen(p, e, enc);
853 if (!MBCLEN_CHARFOUND_P(ret)) {
855 return p - s;
856 }
857 p += MBCLEN_CHARFOUND_LEN(ret);
858 if (p == e) break;
859 p = search_nonascii(p, e);
860 if (!p) break;
861 }
862 }
863 else {
864 while (p < e) {
865 int ret = rb_enc_precise_mbclen(p, e, enc);
866 if (!MBCLEN_CHARFOUND_P(ret)) {
868 return p - s;
869 }
870 p += MBCLEN_CHARFOUND_LEN(ret);
871 }
872 }
874 return e - s;
875}
876
877static inline void
878str_enc_copy(VALUE str1, VALUE str2)
879{
880 rb_enc_set_index(str1, ENCODING_GET(str2));
881}
882
883/* Like str_enc_copy, but does not check frozen status of str1.
884 * You should use this only if you're certain that str1 is not frozen. */
885static inline void
886str_enc_copy_direct(VALUE str1, VALUE str2)
887{
888 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
889 if (inlined_encoding == ENCODING_INLINE_MAX) {
890 rb_enc_set_index(str1, rb_enc_get_index(str2));
891 }
892 else {
893 ENCODING_SET_INLINED(str1, inlined_encoding);
894 }
895}
896
897static void
898rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
899{
900 /* this function is designed for copying encoding and coderange
901 * from src to new string "dest" which is made from the part of src.
902 */
903 str_enc_copy(dest, src);
904 if (RSTRING_LEN(dest) == 0) {
905 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
907 else
909 return;
910 }
911 switch (ENC_CODERANGE(src)) {
914 break;
916 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
917 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
919 else
921 break;
922 default:
923 break;
924 }
925}
926
927static void
928rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
929{
930 str_enc_copy(dest, src);
932}
933
934static int
935enc_coderange_scan(VALUE str, rb_encoding *enc)
936{
937 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
938}
939
940int
941rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
942{
943 return enc_coderange_scan(str, enc);
944}
945
946int
948{
949 int cr = ENC_CODERANGE(str);
950
951 if (cr == ENC_CODERANGE_UNKNOWN) {
952 cr = enc_coderange_scan(str, get_encoding(str));
953 ENC_CODERANGE_SET(str, cr);
954 }
955 return cr;
956}
957
958static inline bool
959rb_enc_str_asciicompat(VALUE str)
960{
961 int encindex = ENCODING_GET_INLINED(str);
962 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
963}
964
965int
967{
968 switch(ENC_CODERANGE(str)) {
970 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
972 return true;
973 default:
974 return false;
975 }
976}
977
978static inline void
979str_mod_check(VALUE s, const char *p, long len)
980{
981 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
982 rb_raise(rb_eRuntimeError, "string modified");
983 }
984}
985
986static size_t
987str_capacity(VALUE str, const int termlen)
988{
989 if (STR_EMBED_P(str)) {
990 return str_embed_capa(str) - termlen;
991 }
992 else if (FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
993 return RSTRING(str)->len;
994 }
995 else {
996 return RSTRING(str)->as.heap.aux.capa;
997 }
998}
999
1000size_t
1002{
1003 return str_capacity(str, TERM_LEN(str));
1004}
1005
1006static inline void
1007must_not_null(const char *ptr)
1008{
1009 if (!ptr) {
1010 rb_raise(rb_eArgError, "NULL pointer given");
1011 }
1012}
1013
1014static inline VALUE
1015str_alloc_embed(VALUE klass, size_t capa)
1016{
1017 size_t size = rb_str_embed_size(capa, 0);
1018 RUBY_ASSERT(size > 0);
1019 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1020
1021 NEWOBJ_OF(str, struct RString, klass,
1023
1024 str->len = 0;
1025 str->as.embed.ary[0] = 0;
1026
1027 return (VALUE)str;
1028}
1029
1030static inline VALUE
1031str_alloc_heap(VALUE klass)
1032{
1033 NEWOBJ_OF(str, struct RString, klass,
1034 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
1035
1036 str->len = 0;
1037 str->as.heap.aux.capa = 0;
1038 str->as.heap.ptr = NULL;
1039
1040 return (VALUE)str;
1041}
1042
1043static inline VALUE
1044empty_str_alloc(VALUE klass)
1045{
1046 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1047 VALUE str = str_alloc_embed(klass, 0);
1048 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
1050 return str;
1051}
1052
1053static VALUE
1054str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
1055{
1056 VALUE str;
1057
1058 if (len < 0) {
1059 rb_raise(rb_eArgError, "negative string size (or size too big)");
1060 }
1061
1062 if (enc == NULL) {
1063 enc = rb_ascii8bit_encoding();
1064 }
1065
1066 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1067
1068 int termlen = rb_enc_mbminlen(enc);
1069
1070 if (STR_EMBEDDABLE_P(len, termlen)) {
1071 str = str_alloc_embed(klass, len + termlen);
1072 if (len == 0) {
1073 ENC_CODERANGE_SET(str, rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1074 }
1075 }
1076 else {
1077 str = str_alloc_heap(klass);
1078 RSTRING(str)->as.heap.aux.capa = len;
1079 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
1080 * integer overflow. If we can STATIC_ASSERT that, the following
1081 * mul_add_mul can be reverted to a simple ALLOC_N. */
1082 RSTRING(str)->as.heap.ptr =
1083 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
1084 }
1085
1086 rb_enc_raw_set(str, enc);
1087
1088 if (ptr) {
1089 memcpy(RSTRING_PTR(str), ptr, len);
1090 }
1091 else {
1092 memset(RSTRING_PTR(str), 0, len);
1093 }
1094
1095 STR_SET_LEN(str, len);
1096 TERM_FILL(RSTRING_PTR(str) + len, termlen);
1097 return str;
1098}
1099
1100static VALUE
1101str_new(VALUE klass, const char *ptr, long len)
1102{
1103 return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
1104}
1105
1106VALUE
1107rb_str_new(const char *ptr, long len)
1108{
1109 return str_new(rb_cString, ptr, len);
1110}
1111
1112VALUE
1113rb_usascii_str_new(const char *ptr, long len)
1114{
1115 return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
1116}
1117
1118VALUE
1119rb_utf8_str_new(const char *ptr, long len)
1120{
1121 return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
1122}
1123
1124VALUE
1125rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
1126{
1127 return str_enc_new(rb_cString, ptr, len, enc);
1128}
1129
1130VALUE
1131rb_str_new_cstr(const char *ptr)
1132{
1133 must_not_null(ptr);
1134 /* rb_str_new_cstr() can take pointer from non-malloc-generated
1135 * memory regions, and that cannot be detected by the MSAN. Just
1136 * trust the programmer that the argument passed here is a sane C
1137 * string. */
1138 __msan_unpoison_string(ptr);
1139 return rb_str_new(ptr, strlen(ptr));
1140}
1141
1142VALUE
1144{
1146}
1147
1148VALUE
1149rb_utf8_str_new_cstr(const char *ptr)
1150{
1152}
1153
1154VALUE
1155rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
1156{
1157 must_not_null(ptr);
1158 if (rb_enc_mbminlen(enc) != 1) {
1159 rb_raise(rb_eArgError, "wchar encoding given");
1160 }
1161 return rb_enc_str_new(ptr, strlen(ptr), enc);
1162}
1163
1164static VALUE
1165str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1166{
1167 VALUE str;
1168
1169 if (len < 0) {
1170 rb_raise(rb_eArgError, "negative string size (or size too big)");
1171 }
1172
1173 if (!ptr) {
1174 str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
1175 }
1176 else {
1177 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1178 str = str_alloc_heap(klass);
1179 RSTRING(str)->len = len;
1180 RSTRING(str)->as.heap.ptr = (char *)ptr;
1181 RSTRING(str)->as.heap.aux.capa = len;
1182 RBASIC(str)->flags |= STR_NOFREE;
1183 rb_enc_associate_index(str, encindex);
1184 }
1185 return str;
1186}
1187
1188VALUE
1189rb_str_new_static(const char *ptr, long len)
1190{
1191 return str_new_static(rb_cString, ptr, len, 0);
1192}
1193
1194VALUE
1195rb_usascii_str_new_static(const char *ptr, long len)
1196{
1197 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1198}
1199
1200VALUE
1201rb_utf8_str_new_static(const char *ptr, long len)
1202{
1203 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1204}
1205
1206VALUE
1207rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
1208{
1209 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1210}
1211
1212static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1213 rb_encoding *from, rb_encoding *to,
1214 int ecflags, VALUE ecopts);
1215
1216static inline bool
1217is_enc_ascii_string(VALUE str, rb_encoding *enc)
1218{
1219 int encidx = rb_enc_to_index(enc);
1220 if (rb_enc_get_index(str) == encidx)
1221 return is_ascii_string(str);
1222 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1223}
1224
1225VALUE
1226rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1227{
1228 long len;
1229 const char *ptr;
1230 VALUE newstr;
1231
1232 if (!to) return str;
1233 if (!from) from = rb_enc_get(str);
1234 if (from == to) return str;
1235 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1236 rb_is_ascii8bit_enc(to)) {
1237 if (STR_ENC_GET(str) != to) {
1238 str = rb_str_dup(str);
1239 rb_enc_associate(str, to);
1240 }
1241 return str;
1242 }
1243
1244 RSTRING_GETMEM(str, ptr, len);
1245 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1246 from, to, ecflags, ecopts);
1247 if (NIL_P(newstr)) {
1248 /* some error, return original */
1249 return str;
1250 }
1251 return newstr;
1252}
1253
1254VALUE
1255rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1256 rb_encoding *from, int ecflags, VALUE ecopts)
1257{
1258 long olen;
1259
1260 olen = RSTRING_LEN(newstr);
1261 if (ofs < -olen || olen < ofs)
1262 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1263 if (ofs < 0) ofs += olen;
1264 if (!from) {
1265 STR_SET_LEN(newstr, ofs);
1266 return rb_str_cat(newstr, ptr, len);
1267 }
1268
1269 rb_str_modify(newstr);
1270 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1271 rb_enc_get(newstr),
1272 ecflags, ecopts);
1273}
1274
1275VALUE
1276rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1277{
1278 STR_SET_LEN(str, 0);
1279 rb_enc_associate(str, enc);
1280 rb_str_cat(str, ptr, len);
1281 return str;
1282}
1283
1284static VALUE
1285str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1286 rb_encoding *from, rb_encoding *to,
1287 int ecflags, VALUE ecopts)
1288{
1289 rb_econv_t *ec;
1291 long olen;
1292 VALUE econv_wrapper;
1293 const unsigned char *start, *sp;
1294 unsigned char *dest, *dp;
1295 size_t converted_output = (size_t)ofs;
1296
1297 olen = rb_str_capacity(newstr);
1298
1299 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1300 RBASIC_CLEAR_CLASS(econv_wrapper);
1301 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1302 if (!ec) return Qnil;
1303 DATA_PTR(econv_wrapper) = ec;
1304
1305 sp = (unsigned char*)ptr;
1306 start = sp;
1307 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1308 (dp = dest + converted_output),
1309 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1311 /* destination buffer short */
1312 size_t converted_input = sp - start;
1313 size_t rest = len - converted_input;
1314 converted_output = dp - dest;
1315 rb_str_set_len(newstr, converted_output);
1316 if (converted_input && converted_output &&
1317 rest < (LONG_MAX / converted_output)) {
1318 rest = (rest * converted_output) / converted_input;
1319 }
1320 else {
1321 rest = olen;
1322 }
1323 olen += rest < 2 ? 2 : rest;
1324 rb_str_resize(newstr, olen);
1325 }
1326 DATA_PTR(econv_wrapper) = 0;
1327 RB_GC_GUARD(econv_wrapper);
1328 rb_econv_close(ec);
1329 switch (ret) {
1330 case econv_finished:
1331 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1332 rb_str_set_len(newstr, len);
1333 rb_enc_associate(newstr, to);
1334 return newstr;
1335
1336 default:
1337 return Qnil;
1338 }
1339}
1340
1341VALUE
1342rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
1343{
1344 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1345}
1346
1347VALUE
1348rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
1349{
1350 rb_encoding *ienc;
1351 VALUE str;
1352 const int eidx = rb_enc_to_index(eenc);
1353
1354 if (!ptr) {
1355 return rb_enc_str_new(ptr, len, eenc);
1356 }
1357
1358 /* ASCII-8BIT case, no conversion */
1359 if ((eidx == rb_ascii8bit_encindex()) ||
1360 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1361 return rb_str_new(ptr, len);
1362 }
1363 /* no default_internal or same encoding, no conversion */
1365 if (!ienc || eenc == ienc) {
1366 return rb_enc_str_new(ptr, len, eenc);
1367 }
1368 /* ASCII compatible, and ASCII only string, no conversion in
1369 * default_internal */
1370 if ((eidx == rb_ascii8bit_encindex()) ||
1371 (eidx == rb_usascii_encindex()) ||
1372 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1373 return rb_enc_str_new(ptr, len, ienc);
1374 }
1375 /* convert from the given encoding to default_internal */
1376 str = rb_enc_str_new(NULL, 0, ienc);
1377 /* when the conversion failed for some reason, just ignore the
1378 * default_internal and result in the given encoding as-is. */
1379 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1380 rb_str_initialize(str, ptr, len, eenc);
1381 }
1382 return str;
1383}
1384
1385VALUE
1386rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1387{
1388 int eidx = rb_enc_to_index(eenc);
1389 if (eidx == rb_usascii_encindex() &&
1390 !is_ascii_string(str)) {
1391 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1392 return str;
1393 }
1394 rb_enc_associate_index(str, eidx);
1395 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1396}
1397
1398VALUE
1399rb_external_str_new(const char *ptr, long len)
1400{
1402}
1403
1404VALUE
1406{
1408}
1409
1410VALUE
1411rb_locale_str_new(const char *ptr, long len)
1412{
1414}
1415
1416VALUE
1418{
1419 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1420}
1421
1422VALUE
1423rb_filesystem_str_new(const char *ptr, long len)
1424{
1426}
1427
1428VALUE
1430{
1431 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1432}
1433
1434VALUE
1439
1440VALUE
1445
1446VALUE
1447rb_str_export_to_enc(VALUE str, rb_encoding *enc)
1448{
1449 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1450}
1451
1452static VALUE
1453str_replace_shared_without_enc(VALUE str2, VALUE str)
1454{
1455 const int termlen = TERM_LEN(str);
1456 char *ptr;
1457 long len;
1458
1459 RSTRING_GETMEM(str, ptr, len);
1460 if (str_embed_capa(str2) >= len + termlen) {
1461 char *ptr2 = RSTRING(str2)->as.embed.ary;
1462 STR_SET_EMBED(str2);
1463 memcpy(ptr2, RSTRING_PTR(str), len);
1464 TERM_FILL(ptr2+len, termlen);
1465 }
1466 else {
1467 VALUE root;
1468 if (STR_SHARED_P(str)) {
1469 root = RSTRING(str)->as.heap.aux.shared;
1470 RSTRING_GETMEM(str, ptr, len);
1471 }
1472 else {
1473 root = rb_str_new_frozen(str);
1474 RSTRING_GETMEM(root, ptr, len);
1475 }
1476 RUBY_ASSERT(OBJ_FROZEN(root));
1477
1478 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1479 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1480 rb_fatal("about to free a possible shared root");
1481 }
1482 char *ptr2 = STR_HEAP_PTR(str2);
1483 if (ptr2 != ptr) {
1484 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1485 }
1486 }
1487 FL_SET(str2, STR_NOEMBED);
1488 RSTRING(str2)->as.heap.ptr = ptr;
1489 STR_SET_SHARED(str2, root);
1490 }
1491
1492 STR_SET_LEN(str2, len);
1493
1494 return str2;
1495}
1496
1497static VALUE
1498str_replace_shared(VALUE str2, VALUE str)
1499{
1500 str_replace_shared_without_enc(str2, str);
1501 rb_enc_cr_str_exact_copy(str2, str);
1502 return str2;
1503}
1504
1505static VALUE
1506str_new_shared(VALUE klass, VALUE str)
1507{
1508 return str_replace_shared(str_alloc_heap(klass), str);
1509}
1510
1511VALUE
1513{
1514 return str_new_shared(rb_obj_class(str), str);
1515}
1516
1517VALUE
1519{
1520 if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1521 return str_new_frozen(rb_obj_class(orig), orig);
1522}
1523
1524static VALUE
1525rb_str_new_frozen_String(VALUE orig)
1526{
1527 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1528 return str_new_frozen(rb_cString, orig);
1529}
1530
1531
1532VALUE
1533rb_str_frozen_bare_string(VALUE orig)
1534{
1535 if (RB_LIKELY(BARE_STRING_P(orig) && OBJ_FROZEN_RAW(orig))) return orig;
1536 return str_new_frozen(rb_cString, orig);
1537}
1538
1539VALUE
1540rb_str_tmp_frozen_acquire(VALUE orig)
1541{
1542 if (OBJ_FROZEN_RAW(orig)) return orig;
1543 return str_new_frozen_buffer(0, orig, FALSE);
1544}
1545
1546VALUE
1547rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1548{
1549 if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1550 if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1551
1552 VALUE str = str_alloc_heap(0);
1553 OBJ_FREEZE(str);
1554 /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1555 FL_SET(str, STR_SHARED_ROOT);
1556
1557 size_t capa = str_capacity(orig, TERM_LEN(orig));
1558
1559 /* If the string is embedded then we want to create a copy that is heap
1560 * allocated. If the string is shared then the shared root must be
1561 * embedded, so we want to create a copy. If the string is a shared root
1562 * then it must be embedded, so we want to create a copy. */
1563 if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT | RSTRING_FSTR)) {
1564 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1565 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1566 }
1567 else {
1568 /* orig must be heap allocated and not shared, so we can safely transfer
1569 * the pointer to str. */
1570 RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
1571 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1572 RBASIC(orig)->flags &= ~STR_NOFREE;
1573 STR_SET_SHARED(orig, str);
1574 if (RB_OBJ_SHAREABLE_P(orig)) {
1575 RB_OBJ_SET_SHAREABLE(str);
1576 RUBY_ASSERT((rb_gc_verify_shareable(str), 1));
1577 }
1578 }
1579
1580 RSTRING(str)->len = RSTRING(orig)->len;
1581 RSTRING(str)->as.heap.aux.capa = capa;
1582
1583 return str;
1584}
1585
1586void
1587rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1588{
1589 if (RBASIC_CLASS(tmp) != 0)
1590 return;
1591
1592 if (STR_EMBED_P(tmp)) {
1594 }
1595 else if (FL_TEST_RAW(orig, STR_SHARED | STR_TMPLOCK) == STR_TMPLOCK &&
1596 !OBJ_FROZEN_RAW(orig)) {
1597 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1598
1599 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1600 RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1601 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1602
1603 /* Unshare orig since the root (tmp) only has this one child. */
1604 FL_UNSET_RAW(orig, STR_SHARED);
1605 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1606 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1608
1609 /* Make tmp embedded and empty so it is safe for sweeping. */
1610 STR_SET_EMBED(tmp);
1611 STR_SET_LEN(tmp, 0);
1612 }
1613 }
1614}
1615
1616static VALUE
1617str_new_frozen(VALUE klass, VALUE orig)
1618{
1619 return str_new_frozen_buffer(klass, orig, TRUE);
1620}
1621
1622static VALUE
1623heap_str_make_shared(VALUE klass, VALUE orig)
1624{
1625 RUBY_ASSERT(!STR_EMBED_P(orig));
1626 RUBY_ASSERT(!STR_SHARED_P(orig));
1628
1629 VALUE str = str_alloc_heap(klass);
1630 STR_SET_LEN(str, RSTRING_LEN(orig));
1631 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1632 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1633 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1634 RBASIC(orig)->flags &= ~STR_NOFREE;
1635 STR_SET_SHARED(orig, str);
1636 if (klass == 0)
1637 FL_UNSET_RAW(str, STR_BORROWED);
1638 return str;
1639}
1640
1641static VALUE
1642str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1643{
1644 VALUE str;
1645
1646 long len = RSTRING_LEN(orig);
1647 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1648 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1649
1650 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1651 str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
1652 RUBY_ASSERT(STR_EMBED_P(str));
1653 }
1654 else {
1655 if (FL_TEST_RAW(orig, STR_SHARED)) {
1656 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1657 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1658 long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1659 RUBY_ASSERT(ofs >= 0);
1660 RUBY_ASSERT(rest >= 0);
1661 RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
1663
1664 if ((ofs > 0) || (rest > 0) ||
1665 (klass != RBASIC(shared)->klass) ||
1666 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1667 str = str_new_shared(klass, shared);
1668 RUBY_ASSERT(!STR_EMBED_P(str));
1669 RSTRING(str)->as.heap.ptr += ofs;
1670 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1671 }
1672 else {
1673 if (RBASIC_CLASS(shared) == 0)
1674 FL_SET_RAW(shared, STR_BORROWED);
1675 return shared;
1676 }
1677 }
1678 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1679 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1680 STR_SET_EMBED(str);
1681 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1682 STR_SET_LEN(str, RSTRING_LEN(orig));
1683 ENC_CODERANGE_SET(str, ENC_CODERANGE(orig));
1684 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1685 }
1686 else {
1687 if (RB_OBJ_SHAREABLE_P(orig)) {
1688 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
1689 }
1690 else {
1691 str = heap_str_make_shared(klass, orig);
1692 }
1693 }
1694 }
1695
1696 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1697 OBJ_FREEZE(str);
1698 return str;
1699}
1700
1701VALUE
1702rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1703{
1704 return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
1705}
1706
1707static VALUE
1708str_new_empty_String(VALUE str)
1709{
1710 VALUE v = rb_str_new(0, 0);
1711 rb_enc_copy(v, str);
1712 return v;
1713}
1714
1715#define STR_BUF_MIN_SIZE 63
1716
1717VALUE
1719{
1720 if (STR_EMBEDDABLE_P(capa, 1)) {
1721 return str_alloc_embed(rb_cString, capa + 1);
1722 }
1723
1724 VALUE str = str_alloc_heap(rb_cString);
1725
1726 RSTRING(str)->as.heap.aux.capa = capa;
1727 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1728 RSTRING(str)->as.heap.ptr[0] = '\0';
1729
1730 return str;
1731}
1732
1733VALUE
1734rb_str_buf_new_cstr(const char *ptr)
1735{
1736 VALUE str;
1737 long len = strlen(ptr);
1738
1739 str = rb_str_buf_new(len);
1740 rb_str_buf_cat(str, ptr, len);
1741
1742 return str;
1743}
1744
1745VALUE
1747{
1748 return str_new(0, 0, len);
1749}
1750
1751void
1753{
1754 if (STR_EMBED_P(str)) {
1755 RB_DEBUG_COUNTER_INC(obj_str_embed);
1756 }
1757 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1758 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1759 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1760 }
1761 else {
1762 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1763 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1764 }
1765}
1766
1767size_t
1768rb_str_memsize(VALUE str)
1769{
1770 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1771 return STR_HEAP_SIZE(str);
1772 }
1773 else {
1774 return 0;
1775 }
1776}
1777
1778VALUE
1780{
1781 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1782}
1783
1784static inline void str_discard(VALUE str);
1785static void str_shared_replace(VALUE str, VALUE str2);
1786
1787void
1789{
1790 if (str != str2) str_shared_replace(str, str2);
1791}
1792
1793static void
1794str_shared_replace(VALUE str, VALUE str2)
1795{
1796 rb_encoding *enc;
1797 int cr;
1798 int termlen;
1799
1800 RUBY_ASSERT(str2 != str);
1801 enc = STR_ENC_GET(str2);
1802 cr = ENC_CODERANGE(str2);
1803 str_discard(str);
1804 termlen = rb_enc_mbminlen(enc);
1805
1806 STR_SET_LEN(str, RSTRING_LEN(str2));
1807
1808 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1809 STR_SET_EMBED(str);
1810 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1811 rb_enc_associate(str, enc);
1812 ENC_CODERANGE_SET(str, cr);
1813 }
1814 else {
1815 if (STR_EMBED_P(str2)) {
1816 RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
1817 long len = RSTRING_LEN(str2);
1818 RUBY_ASSERT(len + termlen <= str_embed_capa(str2));
1819
1820 char *new_ptr = ALLOC_N(char, len + termlen);
1821 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1822 RSTRING(str2)->as.heap.ptr = new_ptr;
1823 STR_SET_LEN(str2, len);
1824 RSTRING(str2)->as.heap.aux.capa = len;
1825 STR_SET_NOEMBED(str2);
1826 }
1827
1828 STR_SET_NOEMBED(str);
1829 FL_UNSET(str, STR_SHARED);
1830 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1831
1832 if (FL_TEST(str2, STR_SHARED)) {
1833 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1834 STR_SET_SHARED(str, shared);
1835 }
1836 else {
1837 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1838 }
1839
1840 /* abandon str2 */
1841 STR_SET_EMBED(str2);
1842 RSTRING_PTR(str2)[0] = 0;
1843 STR_SET_LEN(str2, 0);
1844 rb_enc_associate(str, enc);
1845 ENC_CODERANGE_SET(str, cr);
1846 }
1847}
1848
1849VALUE
1851{
1852 VALUE str;
1853
1854 if (RB_TYPE_P(obj, T_STRING)) {
1855 return obj;
1856 }
1857 str = rb_funcall(obj, idTo_s, 0);
1858 return rb_obj_as_string_result(str, obj);
1859}
1860
1861VALUE
1862rb_obj_as_string_result(VALUE str, VALUE obj)
1863{
1864 if (!RB_TYPE_P(str, T_STRING))
1865 return rb_any_to_s(obj);
1866 return str;
1867}
1868
1869static VALUE
1870str_replace(VALUE str, VALUE str2)
1871{
1872 long len;
1873
1874 len = RSTRING_LEN(str2);
1875 if (STR_SHARED_P(str2)) {
1876 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1878 STR_SET_NOEMBED(str);
1879 STR_SET_LEN(str, len);
1880 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1881 STR_SET_SHARED(str, shared);
1882 rb_enc_cr_str_exact_copy(str, str2);
1883 }
1884 else {
1885 str_replace_shared(str, str2);
1886 }
1887
1888 return str;
1889}
1890
1891static inline VALUE
1892ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1893{
1894 size_t size = rb_str_embed_size(capa, 0);
1895 RUBY_ASSERT(size > 0);
1896 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1897
1898 NEWOBJ_OF(str, struct RString, klass,
1900
1901 str->len = 0;
1902
1903 return (VALUE)str;
1904}
1905
1906static inline VALUE
1907ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1908{
1909 NEWOBJ_OF(str, struct RString, klass,
1910 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
1911
1912 str->as.heap.aux.capa = 0;
1913 str->as.heap.ptr = NULL;
1914
1915 return (VALUE)str;
1916}
1917
1918static inline VALUE
1919str_duplicate_setup_encoding(VALUE str, VALUE dup, VALUE flags)
1920{
1921 int encidx = 0;
1922 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1923 encidx = rb_enc_get_index(str);
1924 flags &= ~ENCODING_MASK;
1925 }
1926 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1927 if (encidx) rb_enc_associate_index(dup, encidx);
1928 return dup;
1929}
1930
1931static const VALUE flag_mask = ENC_CODERANGE_MASK | ENCODING_MASK | FL_FREEZE;
1932
1933static inline VALUE
1934str_duplicate_setup_embed(VALUE klass, VALUE str, VALUE dup)
1935{
1936 VALUE flags = FL_TEST_RAW(str, flag_mask);
1937 long len = RSTRING_LEN(str);
1938
1939 RUBY_ASSERT(STR_EMBED_P(dup));
1940 RUBY_ASSERT(str_embed_capa(dup) >= len + TERM_LEN(str));
1941 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + TERM_LEN(str));
1942 STR_SET_LEN(dup, RSTRING_LEN(str));
1943 return str_duplicate_setup_encoding(str, dup, flags);
1944}
1945
1946static inline VALUE
1947str_duplicate_setup_heap(VALUE klass, VALUE str, VALUE dup)
1948{
1949 VALUE flags = FL_TEST_RAW(str, flag_mask);
1950 VALUE root = str;
1951 if (FL_TEST_RAW(str, STR_SHARED)) {
1952 root = RSTRING(str)->as.heap.aux.shared;
1953 }
1954 else if (UNLIKELY(!OBJ_FROZEN_RAW(str))) {
1955 root = str = str_new_frozen(klass, str);
1956 flags = FL_TEST_RAW(str, flag_mask);
1957 }
1958 RUBY_ASSERT(!STR_SHARED_P(root));
1960
1961 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1962 FL_SET(root, STR_SHARED_ROOT);
1963 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1964 flags |= RSTRING_NOEMBED | STR_SHARED;
1965
1966 STR_SET_LEN(dup, RSTRING_LEN(str));
1967 return str_duplicate_setup_encoding(str, dup, flags);
1968}
1969
1970static inline VALUE
1971str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1972{
1973 if (STR_EMBED_P(str)) {
1974 return str_duplicate_setup_embed(klass, str, dup);
1975 }
1976 else {
1977 return str_duplicate_setup_heap(klass, str, dup);
1978 }
1979}
1980
1981static inline VALUE
1982str_duplicate(VALUE klass, VALUE str)
1983{
1984 VALUE dup;
1985 if (STR_EMBED_P(str)) {
1986 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1987 }
1988 else {
1989 dup = str_alloc_heap(klass);
1990 }
1991
1992 return str_duplicate_setup(klass, str, dup);
1993}
1994
1995VALUE
1997{
1998 return str_duplicate(rb_obj_class(str), str);
1999}
2000
2001/* :nodoc: */
2002VALUE
2003rb_str_dup_m(VALUE str)
2004{
2005 if (LIKELY(BARE_STRING_P(str))) {
2006 return str_duplicate(rb_cString, str);
2007 }
2008 else {
2009 return rb_obj_dup(str);
2010 }
2011}
2012
2013VALUE
2015{
2016 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2017 return str_duplicate(rb_cString, str);
2018}
2019
2020VALUE
2021rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
2022{
2023 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2024 VALUE new_str, klass = rb_cString;
2025
2026 if (!(chilled && RTEST(rb_ivar_defined(str, id_debug_created_info))) && STR_EMBED_P(str)) {
2027 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
2028 str_duplicate_setup_embed(klass, str, new_str);
2029 }
2030 else {
2031 new_str = ec_str_alloc_heap(ec, klass);
2032 str_duplicate_setup_heap(klass, str, new_str);
2033 }
2034 if (chilled) {
2035 FL_SET_RAW(new_str, STR_CHILLED_LITERAL);
2036 }
2037 return new_str;
2038}
2039
2040VALUE
2041rb_str_with_debug_created_info(VALUE str, VALUE path, int line)
2042{
2043 VALUE debug_info = rb_ary_new_from_args(2, path, INT2FIX(line));
2044 if (OBJ_FROZEN_RAW(str)) str = rb_str_dup(str);
2045 rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
2046 FL_SET_RAW(str, STR_CHILLED_LITERAL);
2047 return rb_str_freeze(str);
2048}
2049
2050/*
2051 * The documentation block below uses an include (instead of inline text)
2052 * because the included text has non-ASCII characters (which are not allowed in a C file).
2053 */
2054
2055/*
2056 *
2057 * call-seq:
2058 * String.new(string = ''.encode(Encoding::ASCII_8BIT) , **options) -> new_string
2059 *
2060 * :include: doc/string/new.rdoc
2061 *
2062 */
2063
2064static VALUE
2065rb_str_init(int argc, VALUE *argv, VALUE str)
2066{
2067 static ID keyword_ids[2];
2068 VALUE orig, opt, venc, vcapa;
2069 VALUE kwargs[2];
2070 rb_encoding *enc = 0;
2071 int n;
2072
2073 if (!keyword_ids[0]) {
2074 keyword_ids[0] = rb_id_encoding();
2075 CONST_ID(keyword_ids[1], "capacity");
2076 }
2077
2078 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2079 if (!NIL_P(opt)) {
2080 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2081 venc = kwargs[0];
2082 vcapa = kwargs[1];
2083 if (!UNDEF_P(venc) && !NIL_P(venc)) {
2084 enc = rb_to_encoding(venc);
2085 }
2086 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
2087 long capa = NUM2LONG(vcapa);
2088 long len = 0;
2089 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2090
2091 if (capa < STR_BUF_MIN_SIZE) {
2092 capa = STR_BUF_MIN_SIZE;
2093 }
2094 if (n == 1) {
2095 StringValue(orig);
2096 len = RSTRING_LEN(orig);
2097 if (capa < len) {
2098 capa = len;
2099 }
2100 if (orig == str) n = 0;
2101 }
2102 str_modifiable(str);
2103 if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2104 /* make noembed always */
2105 const size_t size = (size_t)capa + termlen;
2106 const char *const old_ptr = RSTRING_PTR(str);
2107 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2108 char *new_ptr = ALLOC_N(char, size);
2109 if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
2110 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2111 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
2112 RSTRING(str)->as.heap.ptr = new_ptr;
2113 }
2114 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
2115 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2116 (size_t)capa + termlen, STR_HEAP_SIZE(str));
2117 }
2118 STR_SET_LEN(str, len);
2119 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
2120 if (n == 1) {
2121 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
2122 rb_enc_cr_str_exact_copy(str, orig);
2123 }
2124 FL_SET(str, STR_NOEMBED);
2125 RSTRING(str)->as.heap.aux.capa = capa;
2126 }
2127 else if (n == 1) {
2128 rb_str_replace(str, orig);
2129 }
2130 if (enc) {
2131 rb_enc_associate(str, enc);
2133 }
2134 }
2135 else if (n == 1) {
2136 rb_str_replace(str, orig);
2137 }
2138 return str;
2139}
2140
2141/* :nodoc: */
2142static VALUE
2143rb_str_s_new(int argc, VALUE *argv, VALUE klass)
2144{
2145 if (klass != rb_cString) {
2146 return rb_class_new_instance_pass_kw(argc, argv, klass);
2147 }
2148
2149 static ID keyword_ids[2];
2150 VALUE orig, opt, encoding = Qnil, capacity = Qnil;
2151 VALUE kwargs[2];
2152 rb_encoding *enc = NULL;
2153
2154 int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2155 if (NIL_P(opt)) {
2156 return rb_class_new_instance_pass_kw(argc, argv, klass);
2157 }
2158
2159 keyword_ids[0] = rb_id_encoding();
2160 CONST_ID(keyword_ids[1], "capacity");
2161 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2162 encoding = kwargs[0];
2163 capacity = kwargs[1];
2164
2165 if (n == 1) {
2166 orig = StringValue(orig);
2167 }
2168 else {
2169 orig = Qnil;
2170 }
2171
2172 if (UNDEF_P(encoding)) {
2173 if (!NIL_P(orig)) {
2174 encoding = rb_obj_encoding(orig);
2175 }
2176 }
2177
2178 if (!UNDEF_P(encoding)) {
2179 enc = rb_to_encoding(encoding);
2180 }
2181
2182 // If capacity is nil, we're basically just duping `orig`.
2183 if (UNDEF_P(capacity)) {
2184 if (NIL_P(orig)) {
2185 VALUE empty_str = str_new(klass, "", 0);
2186 if (enc) {
2187 rb_enc_associate(empty_str, enc);
2188 }
2189 return empty_str;
2190 }
2191 VALUE copy = str_duplicate(klass, orig);
2192 rb_enc_associate(copy, enc);
2193 ENC_CODERANGE_CLEAR(copy);
2194 return copy;
2195 }
2196
2197 long capa = 0;
2198 capa = NUM2LONG(capacity);
2199 if (capa < 0) {
2200 capa = 0;
2201 }
2202
2203 if (!NIL_P(orig)) {
2204 long orig_capa = rb_str_capacity(orig);
2205 if (orig_capa > capa) {
2206 capa = orig_capa;
2207 }
2208 }
2209
2210 VALUE str = str_enc_new(klass, NULL, capa, enc);
2211 STR_SET_LEN(str, 0);
2212 TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);
2213
2214 if (!NIL_P(orig)) {
2215 rb_str_buf_append(str, orig);
2216 }
2217
2218 return str;
2219}
2220
2221#ifdef NONASCII_MASK
2222#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
2223
2224/*
2225 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
2226 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
2227 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
2228 *
2229 * if (!(byte & 0x80))
2230 * byte |= 0x40; // turn on bit6
2231 * return ((byte>>6) & 1); // bit6 represent whether this byte is leading or not.
2232 *
2233 * This function calculates whether a byte is leading or not for all bytes
2234 * in the argument word by concurrently using the above logic, and then
2235 * adds up the number of leading bytes in the word.
2236 */
2237static inline uintptr_t
2238count_utf8_lead_bytes_with_word(const uintptr_t *s)
2239{
2240 uintptr_t d = *s;
2241
2242 /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
2243 d = (d>>6) | (~d>>7);
2244 d &= NONASCII_MASK >> 7;
2245
2246 /* Gather all bytes. */
2247#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
2248 /* use only if it can use POPCNT */
2249 return rb_popcount_intptr(d);
2250#else
2251 d += (d>>8);
2252 d += (d>>16);
2253# if SIZEOF_VOIDP == 8
2254 d += (d>>32);
2255# endif
2256 return (d&0xF);
2257#endif
2258}
2259#endif
2260
2261static inline long
2262enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2263{
2264 long c;
2265 const char *q;
2266
2267 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2268 long diff = (long)(e - p);
2269 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2270 }
2271#ifdef NONASCII_MASK
2272 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2273 uintptr_t len = 0;
2274 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2275 const uintptr_t *s, *t;
2276 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2277 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2278 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2279 while (p < (const char *)s) {
2280 if (is_utf8_lead_byte(*p)) len++;
2281 p++;
2282 }
2283 while (s < t) {
2284 len += count_utf8_lead_bytes_with_word(s);
2285 s++;
2286 }
2287 p = (const char *)s;
2288 }
2289 while (p < e) {
2290 if (is_utf8_lead_byte(*p)) len++;
2291 p++;
2292 }
2293 return (long)len;
2294 }
2295#endif
2296 else if (rb_enc_asciicompat(enc)) {
2297 c = 0;
2298 if (ENC_CODERANGE_CLEAN_P(cr)) {
2299 while (p < e) {
2300 if (ISASCII(*p)) {
2301 q = search_nonascii(p, e);
2302 if (!q)
2303 return c + (e - p);
2304 c += q - p;
2305 p = q;
2306 }
2307 p += rb_enc_fast_mbclen(p, e, enc);
2308 c++;
2309 }
2310 }
2311 else {
2312 while (p < e) {
2313 if (ISASCII(*p)) {
2314 q = search_nonascii(p, e);
2315 if (!q)
2316 return c + (e - p);
2317 c += q - p;
2318 p = q;
2319 }
2320 p += rb_enc_mbclen(p, e, enc);
2321 c++;
2322 }
2323 }
2324 return c;
2325 }
2326
2327 for (c=0; p<e; c++) {
2328 p += rb_enc_mbclen(p, e, enc);
2329 }
2330 return c;
2331}
2332
2333long
2334rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2335{
2336 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2337}
2338
2339/* To get strlen with cr
2340 * Note that given cr is not used.
2341 */
2342long
2343rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2344{
2345 long c;
2346 const char *q;
2347 int ret;
2348
2349 *cr = 0;
2350 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2351 long diff = (long)(e - p);
2352 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2353 }
2354 else if (rb_enc_asciicompat(enc)) {
2355 c = 0;
2356 while (p < e) {
2357 if (ISASCII(*p)) {
2358 q = search_nonascii(p, e);
2359 if (!q) {
2360 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2361 return c + (e - p);
2362 }
2363 c += q - p;
2364 p = q;
2365 }
2366 ret = rb_enc_precise_mbclen(p, e, enc);
2367 if (MBCLEN_CHARFOUND_P(ret)) {
2368 *cr |= ENC_CODERANGE_VALID;
2369 p += MBCLEN_CHARFOUND_LEN(ret);
2370 }
2371 else {
2373 p++;
2374 }
2375 c++;
2376 }
2377 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2378 return c;
2379 }
2380
2381 for (c=0; p<e; c++) {
2382 ret = rb_enc_precise_mbclen(p, e, enc);
2383 if (MBCLEN_CHARFOUND_P(ret)) {
2384 *cr |= ENC_CODERANGE_VALID;
2385 p += MBCLEN_CHARFOUND_LEN(ret);
2386 }
2387 else {
2389 if (p + rb_enc_mbminlen(enc) <= e)
2390 p += rb_enc_mbminlen(enc);
2391 else
2392 p = e;
2393 }
2394 }
2395 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2396 return c;
2397}
2398
2399/* enc must be str's enc or rb_enc_check(str, str2) */
2400static long
2401str_strlen(VALUE str, rb_encoding *enc)
2402{
2403 const char *p, *e;
2404 int cr;
2405
2406 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2407 if (!enc) enc = STR_ENC_GET(str);
2408 p = RSTRING_PTR(str);
2409 e = RSTRING_END(str);
2410 cr = ENC_CODERANGE(str);
2411
2412 if (cr == ENC_CODERANGE_UNKNOWN) {
2413 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2414 if (cr) ENC_CODERANGE_SET(str, cr);
2415 return n;
2416 }
2417 else {
2418 return enc_strlen(p, e, enc, cr);
2419 }
2420}
2421
2422long
2424{
2425 return str_strlen(str, NULL);
2426}
2427
2428/*
2429 * call-seq:
2430 * length -> integer
2431 *
2432 * :include: doc/string/length.rdoc
2433 *
2434 */
2435
2436VALUE
2438{
2439 return LONG2NUM(str_strlen(str, NULL));
2440}
2441
2442/*
2443 * call-seq:
2444 * bytesize -> integer
2445 *
2446 * :include: doc/string/bytesize.rdoc
2447 *
2448 */
2449
2450VALUE
2451rb_str_bytesize(VALUE str)
2452{
2453 return LONG2NUM(RSTRING_LEN(str));
2454}
2455
2456/*
2457 * call-seq:
2458 * empty? -> true or false
2459 *
2460 * Returns whether the length of +self+ is zero:
2461 *
2462 * 'hello'.empty? # => false
2463 * ' '.empty? # => false
2464 * ''.empty? # => true
2465 *
2466 * Related: see {Querying}[rdoc-ref:String@Querying].
2467 */
2468
2469static VALUE
2470rb_str_empty(VALUE str)
2471{
2472 return RBOOL(RSTRING_LEN(str) == 0);
2473}
2474
2475/*
2476 * call-seq:
2477 * self + other_string -> new_string
2478 *
2479 * Returns a new string containing +other_string+ concatenated to +self+:
2480 *
2481 * 'Hello from ' + self.to_s # => "Hello from main"
2482 *
2483 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2484 */
2485
2486VALUE
2488{
2489 VALUE str3;
2490 rb_encoding *enc;
2491 char *ptr1, *ptr2, *ptr3;
2492 long len1, len2;
2493 int termlen;
2494
2495 StringValue(str2);
2496 enc = rb_enc_check_str(str1, str2);
2497 RSTRING_GETMEM(str1, ptr1, len1);
2498 RSTRING_GETMEM(str2, ptr2, len2);
2499 termlen = rb_enc_mbminlen(enc);
2500 if (len1 > LONG_MAX - len2) {
2501 rb_raise(rb_eArgError, "string size too big");
2502 }
2503 str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
2504 ptr3 = RSTRING_PTR(str3);
2505 memcpy(ptr3, ptr1, len1);
2506 memcpy(ptr3+len1, ptr2, len2);
2507 TERM_FILL(&ptr3[len1+len2], termlen);
2508
2509 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2511 RB_GC_GUARD(str1);
2512 RB_GC_GUARD(str2);
2513 return str3;
2514}
2515
2516/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2517VALUE
2518rb_str_opt_plus(VALUE str1, VALUE str2)
2519{
2522 long len1, len2;
2523 MAYBE_UNUSED(char) *ptr1, *ptr2;
2524 RSTRING_GETMEM(str1, ptr1, len1);
2525 RSTRING_GETMEM(str2, ptr2, len2);
2526 int enc1 = rb_enc_get_index(str1);
2527 int enc2 = rb_enc_get_index(str2);
2528
2529 if (enc1 < 0) {
2530 return Qundef;
2531 }
2532 else if (enc2 < 0) {
2533 return Qundef;
2534 }
2535 else if (enc1 != enc2) {
2536 return Qundef;
2537 }
2538 else if (len1 > LONG_MAX - len2) {
2539 return Qundef;
2540 }
2541 else {
2542 return rb_str_plus(str1, str2);
2543 }
2544
2545}
2546
2547/*
2548 * call-seq:
2549 * self * n -> new_string
2550 *
2551 * Returns a new string containing +n+ copies of +self+:
2552 *
2553 * 'Ho!' * 3 # => "Ho!Ho!Ho!"
2554 * 'No!' * 0 # => ""
2555 *
2556 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2557 */
2558
2559VALUE
2561{
2562 VALUE str2;
2563 long n, len;
2564 char *ptr2;
2565 int termlen;
2566
2567 if (times == INT2FIX(1)) {
2568 return str_duplicate(rb_cString, str);
2569 }
2570 if (times == INT2FIX(0)) {
2571 str2 = str_alloc_embed(rb_cString, 0);
2572 rb_enc_copy(str2, str);
2573 return str2;
2574 }
2575 len = NUM2LONG(times);
2576 if (len < 0) {
2577 rb_raise(rb_eArgError, "negative argument");
2578 }
2579 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2580 if (STR_EMBEDDABLE_P(len, 1)) {
2581 str2 = str_alloc_embed(rb_cString, len + 1);
2582 memset(RSTRING_PTR(str2), 0, len + 1);
2583 }
2584 else {
2585 str2 = str_alloc_heap(rb_cString);
2586 RSTRING(str2)->as.heap.aux.capa = len;
2587 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2588 }
2589 STR_SET_LEN(str2, len);
2590 rb_enc_copy(str2, str);
2591 return str2;
2592 }
2593 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2594 rb_raise(rb_eArgError, "argument too big");
2595 }
2596
2597 len *= RSTRING_LEN(str);
2598 termlen = TERM_LEN(str);
2599 str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
2600 ptr2 = RSTRING_PTR(str2);
2601 if (len) {
2602 n = RSTRING_LEN(str);
2603 memcpy(ptr2, RSTRING_PTR(str), n);
2604 while (n <= len/2) {
2605 memcpy(ptr2 + n, ptr2, n);
2606 n *= 2;
2607 }
2608 memcpy(ptr2 + n, ptr2, len-n);
2609 }
2610 STR_SET_LEN(str2, len);
2611 TERM_FILL(&ptr2[len], termlen);
2612 rb_enc_cr_str_copy_for_substr(str2, str);
2613
2614 return str2;
2615}
2616
2617/*
2618 * call-seq:
2619 * self % object -> new_string
2620 *
2621 * Returns the result of formatting +object+ into the format specifications
2622 * contained in +self+
2623 * (see {Format Specifications}[rdoc-ref:language/format_specifications.rdoc]):
2624 *
2625 * '%05d' % 123 # => "00123"
2626 *
2627 * If +self+ contains multiple format specifications,
2628 * +object+ must be an array or hash containing the objects to be formatted:
2629 *
2630 * '%-5s: %016x' % [ 'ID', self.object_id ] # => "ID : 00002b054ec93168"
2631 * 'foo = %{foo}' % {foo: 'bar'} # => "foo = bar"
2632 * 'foo = %{foo}, baz = %{baz}' % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2633 *
2634 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2635 */
2636
2637static VALUE
2638rb_str_format_m(VALUE str, VALUE arg)
2639{
2640 VALUE tmp = rb_check_array_type(arg);
2641
2642 if (!NIL_P(tmp)) {
2643 VALUE result = rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2644 RB_GC_GUARD(tmp);
2645 return result;
2646 }
2647 return rb_str_format(1, &arg, str);
2648}
2649
2650static inline void
2651rb_check_lockedtmp(VALUE str)
2652{
2653 if (FL_TEST(str, STR_TMPLOCK)) {
2654 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2655 }
2656}
2657
2658// If none of these flags are set, we know we have an modifiable string.
2659// If any is set, we need to do more detailed checks.
2660#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2661static inline void
2662str_modifiable(VALUE str)
2663{
2665
2666 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2667 if (CHILLED_STRING_P(str)) {
2668 CHILLED_STRING_MUTATED(str);
2669 }
2670 rb_check_lockedtmp(str);
2671 rb_check_frozen(str);
2672 }
2673}
2674
2675static inline int
2676str_dependent_p(VALUE str)
2677{
2678 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2679 return FALSE;
2680 }
2681 else {
2682 return TRUE;
2683 }
2684}
2685
2686// If none of these flags are set, we know we have an independent string.
2687// If any is set, we need to do more detailed checks.
2688#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2689static inline int
2690str_independent(VALUE str)
2691{
2693
2694 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2695 str_modifiable(str);
2696 return !str_dependent_p(str);
2697 }
2698 return TRUE;
2699}
2700
/*
 * Gives +str+ a buffer of its own with room for len + expand bytes plus
 * a +termlen+-byte terminator, keeping (at most) the first +len+ bytes
 * of the current contents.
 *
 * If the result fits in the embedded area of a currently non-embedded
 * string, the bytes are copied into the object itself; otherwise a
 * fresh heap buffer is allocated and installed.
 */
static void
str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
{

    char *ptr;
    char *oldptr;
    long capa = len + expand;

    /* A negative +expand+ shrinks the string: never keep more than capa. */
    if (len > capa) len = capa;

    if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
        /* Grab the heap pointer before flipping to embedded layout.
         * NOTE(review): the old heap pointer is not released here;
         * presumably callers reach this path only for buffers this
         * string does not own -- confirm. */
        ptr = RSTRING(str)->as.heap.ptr;
        STR_SET_EMBED(str);
        memcpy(RSTRING(str)->as.embed.ary, ptr, len);
        TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
        STR_SET_LEN(str, len);
        return;
    }

    ptr = ALLOC_N(char, (size_t)capa + termlen);
    oldptr = RSTRING_PTR(str);
    if (oldptr) {
        memcpy(ptr, oldptr, len);
    }
    /* Free the old buffer only when the flags say it is a plain heap
     * buffer: non-embedded, and neither STR_NOFREE nor STR_SHARED. */
    if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
        xfree(oldptr);
    }
    STR_SET_NOEMBED(str);
    FL_UNSET(str, STR_SHARED|STR_NOFREE);
    TERM_FILL(ptr + len, termlen);
    RSTRING(str)->as.heap.ptr = ptr;
    STR_SET_LEN(str, len);
    RSTRING(str)->as.heap.aux.capa = capa;
}
2736
2737void
2738rb_str_modify(VALUE str)
2739{
2740 if (!str_independent(str))
2741 str_make_independent(str);
2743}
2744
2745void
2747{
2749
2750 int termlen = TERM_LEN(str);
2751 long len = RSTRING_LEN(str);
2752
2753 if (expand < 0) {
2754 rb_raise(rb_eArgError, "negative expanding string size");
2755 }
2756 if (expand >= LONG_MAX - len) {
2757 rb_raise(rb_eArgError, "string size too big");
2758 }
2759
2760 if (!str_independent(str)) {
2761 str_make_independent_expand(str, len, expand, termlen);
2762 }
2763 else if (expand > 0) {
2764 RESIZE_CAPA_TERM(str, len + expand, termlen);
2765 }
2767}
2768
2769/* As rb_str_modify(), but don't clear coderange */
2770static void
2771str_modify_keep_cr(VALUE str)
2772{
2773 if (!str_independent(str))
2774 str_make_independent(str);
2776 /* Force re-scan later */
2778}
2779
2780static inline void
2781str_discard(VALUE str)
2782{
2783 str_modifiable(str);
2784 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2785 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2786 RSTRING(str)->as.heap.ptr = 0;
2787 STR_SET_LEN(str, 0);
2788 }
2789}
2790
2791void
2793{
2794 int encindex = rb_enc_get_index(str);
2795
2796 if (RB_UNLIKELY(encindex == -1)) {
2797 rb_raise(rb_eTypeError, "not encoding capable object");
2798 }
2799
2800 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
2801 return;
2802 }
2803
2804 rb_encoding *enc = rb_enc_from_index(encindex);
2805 if (!rb_enc_asciicompat(enc)) {
2806 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2807 }
2808}
2809
2810VALUE
2812{
2814
2815 VALUE s = *ptr;
2816 if (!RB_TYPE_P(s, T_STRING)) {
2817 s = rb_str_to_str(s);
2818 *ptr = s;
2819 }
2820 return s;
2821}
2822
2823char *
2825{
2826 VALUE str = rb_string_value(ptr);
2827 return RSTRING_PTR(str);
2828}
2829
/*
 * Returns 1 when the first +n+ bytes at +s+ are all zero, 0 otherwise.
 * A non-positive +n+ inspects nothing and yields 1.
 */
static int
zero_filled(const char *s, int n)
{
    const char *cursor = s;
    int remaining = n;

    while (remaining > 0) {
        if (*cursor != '\0') return 0;
        cursor++;
        remaining--;
    }
    return 1;
}
2838
2839static const char *
2840str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2841{
2842 const char *e = s + len;
2843
2844 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2845 if (zero_filled(s, minlen)) return s;
2846 }
2847 return 0;
2848}
2849
2850static char *
2851str_fill_term(VALUE str, char *s, long len, int termlen)
2852{
2853 /* This function assumes that (capa + termlen) bytes of memory
2854 * is allocated, like many other functions in this file.
2855 */
2856 if (str_dependent_p(str)) {
2857 if (!zero_filled(s + len, termlen))
2858 str_make_independent_expand(str, len, 0L, termlen);
2859 }
2860 else {
2861 TERM_FILL(s + len, termlen);
2862 return s;
2863 }
2864 return RSTRING_PTR(str);
2865}
2866
/*
 * Adjusts +str+ after its terminator width changes from +oldtermlen+ to
 * +termlen+ bytes, reallocating only when the existing buffer cannot
 * hold the new terminator.
 */
void
rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
{
    /* Total bytes available, counting the room used by the old terminator. */
    long capa = str_capacity(str, oldtermlen) + oldtermlen;
    long len = RSTRING_LEN(str);

    RUBY_ASSERT(capa >= len);
    if (capa - len < termlen) {
        /* Not enough room after the contents: a new buffer is needed,
         * which is refused for temporarily locked strings. */
        rb_check_lockedtmp(str);
        str_make_independent_expand(str, len, 0L, termlen);
    }
    else if (str_dependent_p(str)) {
        /* A dependent buffer is not written to; copy it only when the
         * terminator actually grows. */
        if (termlen > oldtermlen)
            str_make_independent_expand(str, len, 0L, termlen);
    }
    else {
        if (!STR_EMBED_P(str)) {
            /* modify capa instead of realloc */
            RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
            RSTRING(str)->as.heap.aux.capa = capa - termlen;
        }
        if (termlen > oldtermlen) {
            TERM_FILL(RSTRING_PTR(str) + len, termlen);
        }
    }

    return;
}
2895
2896static char *
2897str_null_check(VALUE str, int *w)
2898{
2899 char *s = RSTRING_PTR(str);
2900 long len = RSTRING_LEN(str);
2901 rb_encoding *enc = rb_enc_get(str);
2902 const int minlen = rb_enc_mbminlen(enc);
2903
2904 if (minlen > 1) {
2905 *w = 1;
2906 if (str_null_char(s, len, minlen, enc)) {
2907 return NULL;
2908 }
2909 return str_fill_term(str, s, len, minlen);
2910 }
2911 *w = 0;
2912 if (!s || memchr(s, 0, len)) {
2913 return NULL;
2914 }
2915 if (s[len]) {
2916 s = str_fill_term(str, s, len, minlen);
2917 }
2918 return s;
2919}
2920
2921char *
2922rb_str_to_cstr(VALUE str)
2923{
2924 int w;
2925 return str_null_check(str, &w);
2926}
2927
2928char *
2930{
2931 VALUE str = rb_string_value(ptr);
2932 int w;
2933 char *s = str_null_check(str, &w);
2934 if (!s) {
2935 if (w) {
2936 rb_raise(rb_eArgError, "string contains null char");
2937 }
2938 rb_raise(rb_eArgError, "string contains null byte");
2939 }
2940 return s;
2941}
2942
2943char *
2944rb_str_fill_terminator(VALUE str, const int newminlen)
2945{
2946 char *s = RSTRING_PTR(str);
2947 long len = RSTRING_LEN(str);
2948 return str_fill_term(str, s, len, newminlen);
2949}
2950
2951VALUE
2953{
2954 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2955 return str;
2956}
2957
2958/*
2959 * call-seq:
2960 * String.try_convert(object) -> object, new_string, or nil
2961 *
2962 * Attempts to convert the given +object+ to a string.
2963 *
2964 * If +object+ is already a string, returns +object+, unmodified.
2965 *
2966 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2967 * calls <tt>object.to_str</tt> and returns the result.
2968 *
2969 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2970 *
2971 * Raises an exception unless <tt>object.to_str</tt> returns a string.
2972 */
2973static VALUE
2974rb_str_s_try_convert(VALUE dummy, VALUE str)
2975{
2976 return rb_check_string_type(str);
2977}
2978
/*
 * Advances from +p+ by *nthp characters of encoding +enc+ without going
 * past +e+, and returns the resulting position.
 *
 * *nthp is updated on return.  In the ASCII-compatible branch it holds
 * the number of characters that could not be skipped because the end
 * was reached.  In the two fixed-width branches it is written back
 * unchanged.  In the generic branch it is the loop counter after the
 * post-decrement test, so it can end up as -1 when the requested count
 * was consumed before reaching +e+.
 */
static char*
str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
{
    long nth = *nthp;
    if (rb_enc_mbmaxlen(enc) == 1) {
        /* Single-byte encoding: characters are bytes. */
        p += nth;
    }
    else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
        /* Fixed-width encoding: plain multiplication. */
        p += nth * rb_enc_mbmaxlen(enc);
    }
    else if (rb_enc_asciicompat(enc)) {
        const char *p2, *e2;
        int n;

        while (p < e && 0 < nth) {
            /* Each character takes at least one byte, so p + nth is the
             * earliest possible answer; beyond +e+ means "not enough". */
            e2 = p + nth;
            if (e < e2) {
                *nthp = nth;
                return (char *)e;
            }
            if (ISASCII(*p)) {
                /* Skip an ASCII run in one step. */
                p2 = search_nonascii(p, e2);
                if (!p2) {
                    nth -= e2 - p;
                    *nthp = nth;
                    return (char *)e2;
                }
                nth -= p2 - p;
                p = p2;
            }
            n = rb_enc_mbclen(p, e, enc);
            p += n;
            nth--;
        }
        *nthp = nth;
        if (nth != 0) {
            return (char *)e;
        }
        return (char *)p;
    }
    else {
        while (p < e && nth--) {
            p += rb_enc_mbclen(p, e, enc);
        }
    }
    /* The arithmetic branches may overshoot; clamp to the end. */
    if (p > e) p = e;
    *nthp = nth;
    return (char*)p;
}
3028
3029char*
3030rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
3031{
3032 return str_nth_len(p, e, &nth, enc);
3033}
3034
3035static char*
3036str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3037{
3038 if (singlebyte)
3039 p += nth;
3040 else {
3041 p = str_nth_len(p, e, &nth, enc);
3042 }
3043 if (!p) return 0;
3044 if (p > e) p = e;
3045 return (char *)p;
3046}
3047
3048/* char offset to byte offset */
3049static long
3050str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3051{
3052 const char *pp = str_nth(p, e, nth, enc, singlebyte);
3053 if (!pp) return e - p;
3054 return pp - p;
3055}
3056
3057long
3058rb_str_offset(VALUE str, long pos)
3059{
3060 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3061 STR_ENC_GET(str), single_byte_optimizable(str));
3062}
3063
3064#ifdef NONASCII_MASK
/*
 * UTF-8 specific variant of "advance by *nthp characters": moves from
 * +p+ towards +e+, decrementing the count for every byte that
 * is_utf8_lead_byte() accepts, and returns the position reached.
 * *nthp receives the count that was left over.
 *
 * NOTE(review): callers in this file only use it when the coderange is
 * ENC_CODERANGE_VALID; the counting presumably relies on well-formed
 * input -- confirm.
 */
static char *
str_utf8_nth(const char *p, const char *e, long *nthp)
{
    long nth = *nthp;
    /* Only worth the word-at-a-time pass when both the buffer and the
     * requested count exceed two machine words. */
    if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
        const uintptr_t *s, *t;
        const uintptr_t lowbits = SIZEOF_VOIDP - 1;
        /* s: p rounded up, t: e rounded down, both to word alignment. */
        s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
        t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
        /* Byte-wise until the first aligned word. */
        while (p < (const char *)s) {
            if (is_utf8_lead_byte(*p)) nth--;
            p++;
        }
        /* Whole words while at least a word's worth of characters is
         * still wanted, so the count cannot be overshot here. */
        do {
            nth -= count_utf8_lead_bytes_with_word(s);
            s++;
        } while (s < t && (int)SIZEOF_VOIDP <= nth);
        p = (char *)s;
    }
    /* Byte-wise tail: stop on the lead byte reached with a zero count. */
    while (p < e) {
        if (is_utf8_lead_byte(*p)) {
            if (nth == 0) break;
            nth--;
        }
        p++;
    }
    *nthp = nth;
    return (char *)p;
}
3094
3095static long
3096str_utf8_offset(const char *p, const char *e, long nth)
3097{
3098 const char *pp = str_utf8_nth(p, e, &nth);
3099 return pp - p;
3100}
3101#endif
3102
3103/* byte offset to char offset */
3104long
3105rb_str_sublen(VALUE str, long pos)
3106{
3107 if (single_byte_optimizable(str) || pos < 0)
3108 return pos;
3109 else {
3110 char *p = RSTRING_PTR(str);
3111 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
3112 }
3113}
3114
/*
 * Builds a new String holding the +len+ bytes of +str+ that start at
 * byte offset +beg+.  Both values are byte counts and must describe a
 * range inside +str+ (asserted below).
 *
 * The encoding/coderange of the result is not set up here; the callers
 * in this file follow up with rb_enc_cr_str_copy_for_substr().
 */
static VALUE
str_subseq(VALUE str, long beg, long len)
{
    VALUE str2;

    RUBY_ASSERT(beg >= 0);
    RUBY_ASSERT(len >= 0);
    RUBY_ASSERT(beg+len <= RSTRING_LEN(str));

    const int termlen = TERM_LEN(str);
    if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
        /* Range not eligible for sharing: make a plain copy. */
        str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
        RB_GC_GUARD(str);
        return str2;
    }

    str2 = str_alloc_heap(rb_cString);
    if (str_embed_capa(str2) >= len + termlen) {
        /* Small enough to live inside the new object: copy the bytes. */
        char *ptr2 = RSTRING(str2)->as.embed.ary;
        STR_SET_EMBED(str2);
        memcpy(ptr2, RSTRING_PTR(str) + beg, len);
        TERM_FILL(ptr2+len, termlen);

        STR_SET_LEN(str2, len);
        RB_GC_GUARD(str);
    }
    else {
        /* Share the buffer of +str+, then narrow the view to the
         * requested range. */
        str_replace_shared(str2, str);
        RUBY_ASSERT(!STR_EMBED_P(str2));
        ENC_CODERANGE_CLEAR(str2);
        RSTRING(str2)->as.heap.ptr += beg;
        if (RSTRING_LEN(str2) > len) {
            STR_SET_LEN(str2, len);
        }
    }

    return str2;
}
3153
3154VALUE
3155rb_str_subseq(VALUE str, long beg, long len)
3156{
3157 VALUE str2 = str_subseq(str, beg, len);
3158 rb_enc_cr_str_copy_for_substr(str2, str);
3159 return str2;
3160}
3161
/*
 * Resolves a character-based substring request against +str+.
 *
 * +beg+ is a character index (negative values count from the end) and
 * *lenp a character count.  On success the return value points at the
 * first byte of the substring and *lenp is overwritten with a byte
 * length.  Returns 0 when the request is out of range or the incoming
 * length is negative.
 */
char *
rb_str_subpos(VALUE str, long beg, long *lenp)
{
    long len = *lenp;
    long slen = -1L;
    const long blen = RSTRING_LEN(str);
    rb_encoding *enc = STR_ENC_GET(str);
    char *p, *s = RSTRING_PTR(str), *e = s + blen;

    if (len < 0) return 0;
    /* Rejects a negative +beg+ whose negation is still negative. */
    if (beg < 0 && -beg < 0) return 0;
    if (!blen) {
        len = 0;
    }
    if (single_byte_optimizable(str)) {
        /* Characters are bytes: pure arithmetic. */
        if (beg > blen) return 0;
        if (beg < 0) {
            beg += blen;
            if (beg < 0) return 0;
        }
        if (len > blen - beg)
            len = blen - beg;
        if (len < 0) return 0;
        p = s + beg;
        goto end;
    }
    if (beg < 0) {
        if (len > -beg) len = -beg;
        if ((ENC_CODERANGE(str) == ENC_CODERANGE_VALID) &&
            (-beg * rb_enc_mbmaxlen(enc) < blen / 8)) {
            /* The range sits close to the end of a valid string: walk
             * backwards from the end instead of counting every
             * character from the front. */
            beg = -beg;
            /* First step back over the characters after the range ... */
            while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
            p = e;
            if (!p) return 0;
            /* ... then over the +len+ characters of the range itself. */
            while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
            if (!p) return 0;
            len = e - p;
            goto end;
        }
        else {
            /* Turn the negative index into a positive one. */
            slen = str_strlen(str, enc);
            beg += slen;
            if (beg < 0) return 0;
            p = s + beg;
            if (len == 0) goto end;
        }
    }
    else if (beg > 0 && beg > blen) {
        /* More characters requested than there are bytes. */
        return 0;
    }
    if (len == 0) {
        if (beg > str_strlen(str, enc)) return 0; /* str's enc */
        /* NOTE(review): +beg+ is a character index here but is added as
         * a byte offset; with a zero length only the position of an
         * empty result depends on it -- confirm this is intended. */
        p = s + beg;
    }
#ifdef NONASCII_MASK
    else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
             enc == rb_utf8_encoding()) {
        /* Valid UTF-8: use the dedicated scanner for start and length. */
        p = str_utf8_nth(s, e, &beg);
        if (beg > 0) return 0;
        len = str_utf8_offset(p, e, len);
    }
#endif
    else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
        /* Fixed-width encoding: scale by the character size and clamp
         * the length to what is left. */
        int char_sz = rb_enc_mbmaxlen(enc);

        p = s + beg * char_sz;
        if (p > e) {
            return 0;
        }
        else if (len * char_sz > e - p)
            len = e - p;
        else
            len *= char_sz;
    }
    else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
        /* Start landed on the end: valid only when exactly reached. */
        if (beg > 0) return 0;
        len = 0;
    }
    else {
        len = str_offset(p, e, len, enc, 0);
    }
  end:
    *lenp = len;
    RB_GC_GUARD(str);
    return p;
}
3248
3249static VALUE str_substr(VALUE str, long beg, long len, int empty);
3250
3251VALUE
3252rb_str_substr(VALUE str, long beg, long len)
3253{
3254 return str_substr(str, beg, len, TRUE);
3255}
3256
3257VALUE
3258rb_str_substr_two_fixnums(VALUE str, VALUE beg, VALUE len, int empty)
3259{
3260 return str_substr(str, NUM2LONG(beg), NUM2LONG(len), empty);
3261}
3262
3263static VALUE
3264str_substr(VALUE str, long beg, long len, int empty)
3265{
3266 char *p = rb_str_subpos(str, beg, &len);
3267
3268 if (!p) return Qnil;
3269 if (!len && !empty) return Qnil;
3270
3271 beg = p - RSTRING_PTR(str);
3272
3273 VALUE str2 = str_subseq(str, beg, len);
3274 rb_enc_cr_str_copy_for_substr(str2, str);
3275 return str2;
3276}
3277
3278/* :nodoc: */
3279VALUE
3281{
3282 if (CHILLED_STRING_P(str)) {
3283 FL_UNSET_RAW(str, STR_CHILLED);
3284 }
3285
3286 if (OBJ_FROZEN(str)) return str;
3287 rb_str_resize(str, RSTRING_LEN(str));
3288 return rb_obj_freeze(str);
3289}
3290
3291/*
3292 * call-seq:
3293 * +string -> new_string or self
3294 *
3295 * Returns +self+ if +self+ is not frozen and can be mutated
3296 * without warning issuance.
3297 *
3298 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3299 *
3300 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3301 */
3302static VALUE
3303str_uplus(VALUE str)
3304{
3305 if (OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3306 return rb_str_dup(str);
3307 }
3308 else {
3309 return str;
3310 }
3311}
3312
3313/*
3314 * call-seq:
3315 * -self -> frozen_string
3316 *
3317 * Returns a frozen string equal to +self+.
3318 *
3319 * The returned string is +self+ if and only if all of the following are true:
3320 *
3321 * - +self+ is already frozen.
3322 * - +self+ is an instance of \String (rather than of a subclass of \String)
3323 * - +self+ has no instance variables set on it.
3324 *
3325 * Otherwise, the returned string is a frozen copy of +self+.
3326 *
3327 * Returning +self+, when possible, saves duplicating +self+;
3328 * see {Data deduplication}[https://en.wikipedia.org/wiki/Data_deduplication].
3329 *
3330 * It may also save duplicating other, already-existing, strings:
3331 *
3332 * s0 = 'foo'
3333 * s1 = 'foo'
3334 * s0.object_id == s1.object_id # => false
3335 * (-s0).object_id == (-s1).object_id # => true
3336 *
3337 * Note that method #-@ is convenient for defining a constant:
3338 *
3339 * FileName = -'config/database.yml'
3340 *
3341 * While its alias #dedup is better suited for chaining:
3342 *
3343 * 'foo'.dedup.gsub!('o')
3344 *
3345 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3346 */
3347static VALUE
3348str_uminus(VALUE str)
3349{
3350 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3351 str = rb_str_dup(str);
3352 }
3353 return rb_fstring(str);
3354}
3355
/*
 * rb_str_dup_frozen() is kept as an alias of rb_str_new_frozen().
 * NOTE(review): RUBY_ALIAS_FUNCTION presumably emits the real exported
 * symbol; the #define below routes later uses in this file straight to
 * rb_str_new_frozen -- confirm against the macro's definition.
 */
RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
#define rb_str_dup_frozen rb_str_new_frozen
3358
3359VALUE
3361{
3362 rb_check_frozen(str);
3363 if (FL_TEST(str, STR_TMPLOCK)) {
3364 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3365 }
3366 FL_SET(str, STR_TMPLOCK);
3367 return str;
3368}
3369
3370VALUE
3372{
3373 rb_check_frozen(str);
3374 if (!FL_TEST(str, STR_TMPLOCK)) {
3375 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3376 }
3377 FL_UNSET(str, STR_TMPLOCK);
3378 return str;
3379}
3380
3381VALUE
3382rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3383{
3384 rb_str_locktmp(str);
3385 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3386}
3387
3388void
3390{
3392
3393 long capa;
3394 const int termlen = TERM_LEN(str);
3395
3396 str_modifiable(str);
3397 if (STR_SHARED_P(str)) {
3398 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3399 }
3400 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3401 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3402 }
3403
3404 int cr = ENC_CODERANGE(str);
3405 if (len == 0) {
3406 /* Empty string does not contain non-ASCII */
3408 }
3409 else if (cr == ENC_CODERANGE_UNKNOWN) {
3410 /* Leave unknown. */
3411 }
3412 else if (len > RSTRING_LEN(str)) {
3413 if (ENC_CODERANGE_CLEAN_P(cr)) {
3414 /* Update the coderange regarding the extended part. */
3415 const char *const prev_end = RSTRING_END(str);
3416 const char *const new_end = RSTRING_PTR(str) + len;
3417 rb_encoding *enc = rb_enc_get(str);
3418 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3419 ENC_CODERANGE_SET(str, cr);
3420 }
3421 else if (cr == ENC_CODERANGE_BROKEN) {
3422 /* May be valid now, by appended part. */
3424 }
3425 }
3426 else if (len < RSTRING_LEN(str)) {
3427 if (cr != ENC_CODERANGE_7BIT) {
3428 /* ASCII-only string is keeping after truncated. Valid
3429 * and broken may be invalid or valid, leave unknown. */
3431 }
3432 }
3433
3434 STR_SET_LEN(str, len);
3435 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3436}
3437
3438VALUE
3439rb_str_resize(VALUE str, long len)
3440{
3441 if (len < 0) {
3442 rb_raise(rb_eArgError, "negative string size (or size too big)");
3443 }
3444
3445 int independent = str_independent(str);
3446 long slen = RSTRING_LEN(str);
3447 const int termlen = TERM_LEN(str);
3448
3449 if (slen > len || (termlen != 1 && slen < len)) {
3451 }
3452
3453 {
3454 long capa;
3455 if (STR_EMBED_P(str)) {
3456 if (len == slen) return str;
3457 if (str_embed_capa(str) >= len + termlen) {
3458 STR_SET_LEN(str, len);
3459 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3460 return str;
3461 }
3462 str_make_independent_expand(str, slen, len - slen, termlen);
3463 }
3464 else if (str_embed_capa(str) >= len + termlen) {
3465 char *ptr = STR_HEAP_PTR(str);
3466 STR_SET_EMBED(str);
3467 if (slen > len) slen = len;
3468 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3469 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3470 STR_SET_LEN(str, len);
3471 if (independent) ruby_xfree(ptr);
3472 return str;
3473 }
3474 else if (!independent) {
3475 if (len == slen) return str;
3476 str_make_independent_expand(str, slen, len - slen, termlen);
3477 }
3478 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3479 (capa - len) > (len < 1024 ? len : 1024)) {
3480 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3481 (size_t)len + termlen, STR_HEAP_SIZE(str));
3482 RSTRING(str)->as.heap.aux.capa = len;
3483 }
3484 else if (len == slen) return str;
3485 STR_SET_LEN(str, len);
3486 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3487 }
3488 return str;
3489}
3490
3491static void
3492str_ensure_available_capa(VALUE str, long len)
3493{
3494 str_modify_keep_cr(str);
3495
3496 const int termlen = TERM_LEN(str);
3497 long olen = RSTRING_LEN(str);
3498
3499 if (RB_UNLIKELY(olen > LONG_MAX - len)) {
3500 rb_raise(rb_eArgError, "string sizes too big");
3501 }
3502
3503 long total = olen + len;
3504 long capa = str_capacity(str, termlen);
3505
3506 if (capa < total) {
3507 if (total >= LONG_MAX / 2) {
3508 capa = total;
3509 }
3510 while (total > capa) {
3511 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3512 }
3513 RESIZE_CAPA_TERM(str, capa, termlen);
3514 }
3515}
3516
3517static VALUE
3518str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3519{
3520 if (keep_cr) {
3521 str_modify_keep_cr(str);
3522 }
3523 else {
3524 rb_str_modify(str);
3525 }
3526 if (len == 0) return 0;
3527
3528 long total, olen, off = -1;
3529 char *sptr;
3530 const int termlen = TERM_LEN(str);
3531
3532 RSTRING_GETMEM(str, sptr, olen);
3533 if (ptr >= sptr && ptr <= sptr + olen) {
3534 off = ptr - sptr;
3535 }
3536
3537 long capa = str_capacity(str, termlen);
3538
3539 if (olen > LONG_MAX - len) {
3540 rb_raise(rb_eArgError, "string sizes too big");
3541 }
3542 total = olen + len;
3543 if (capa < total) {
3544 if (total >= LONG_MAX / 2) {
3545 capa = total;
3546 }
3547 while (total > capa) {
3548 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3549 }
3550 RESIZE_CAPA_TERM(str, capa, termlen);
3551 sptr = RSTRING_PTR(str);
3552 }
3553 if (off != -1) {
3554 ptr = sptr + off;
3555 }
3556 memcpy(sptr + olen, ptr, len);
3557 STR_SET_LEN(str, total);
3558 TERM_FILL(sptr + total, termlen); /* sentinel */
3559
3560 return str;
3561}
3562
/* Shorthands for str_buf_cat4() with keep_cr == false.
 * str_buf_cat2() is for string literals: the length comes from
 * rb_strlen_lit(). */
#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3565
3566VALUE
3567rb_str_cat(VALUE str, const char *ptr, long len)
3568{
3569 if (len == 0) return str;
3570 if (len < 0) {
3571 rb_raise(rb_eArgError, "negative string size (or size too big)");
3572 }
3573 return str_buf_cat(str, ptr, len);
3574}
3575
3576VALUE
3577rb_str_cat_cstr(VALUE str, const char *ptr)
3578{
3579 must_not_null(ptr);
3580 return rb_str_buf_cat(str, ptr, strlen(ptr));
3581}
3582
3583static void
3584rb_str_buf_cat_byte(VALUE str, unsigned char byte)
3585{
3586 RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII);
3587
3588 // We can't write directly to shared strings without impacting others, so we must make the string independent.
3589 if (UNLIKELY(!str_independent(str))) {
3590 str_make_independent(str);
3591 }
3592
3593 long string_length = -1;
3594 const int null_terminator_length = 1;
3595 char *sptr;
3596 RSTRING_GETMEM(str, sptr, string_length);
3597
3598 // Ensure the resulting string wouldn't be too long.
3599 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3600 rb_raise(rb_eArgError, "string sizes too big");
3601 }
3602
3603 long string_capacity = str_capacity(str, null_terminator_length);
3604
3605 // Get the code range before any modifications since those might clear the code range.
3606 int cr = ENC_CODERANGE(str);
3607
3608 // Check if the string has spare string_capacity to write the new byte.
3609 if (LIKELY(string_capacity >= string_length + 1)) {
3610 // In fast path we can write the new byte and note the string's new length.
3611 sptr[string_length] = byte;
3612 STR_SET_LEN(str, string_length + 1);
3613 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3614 }
3615 else {
3616 // If there's not enough string_capacity, make a call into the general string concatenation function.
3617 str_buf_cat(str, (char *)&byte, 1);
3618 }
3619
3620 // If the code range is already known, we can derive the resulting code range cheaply by looking at the byte we
3621 // just appended. If the code range is unknown, but the string was empty, then we can also derive the code range
3622 // by looking at the byte we just appended. Otherwise, we'd have to scan the bytes to determine the code range so
3623 // we leave it as unknown. It cannot be broken for binary strings so we don't need to handle that option.
3624 if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
3625 if (ISASCII(byte)) {
3627 }
3628 else {
3630
3631 // Promote a US-ASCII string to ASCII-8BIT when a non-ASCII byte is appended.
3632 if (UNLIKELY(RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)) {
3633 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3634 }
3635 }
3636 }
3637}
3638
/*
 * Alternative names for the append functions:
 * rb_str_buf_cat is an alias of rb_str_cat, and rb_str_buf_cat2 and
 * rb_str_cat2 are aliases of rb_str_cat_cstr.
 */
RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3642
/*
 * Appends the +len+ bytes at +ptr+, which are in the encoding with
 * index +ptr_encindex+ and have coderange +ptr_cr+, to +str+, and works
 * out the encoding index and coderange of the result.
 *
 * When +ptr_cr_ret+ is non-NULL it receives the coderange of the
 * appended bytes, which may have been determined by a scan here.  It is
 * not written on the early-return paths inside the ASCII-incompatible
 * branch.
 *
 * Raises Encoding::CompatibilityError when the two encodings cannot be
 * combined, and ArgumentError for a negative +len+.  Returns +str+.
 */
static VALUE
rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
    int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
{
    int str_encindex = ENCODING_GET(str);
    int res_encindex;
    int str_cr, res_cr;
    rb_encoding *str_enc, *ptr_enc;

    /* An empty receiver is treated as 7bit whatever its cached value. */
    str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;

    if (str_encindex == ptr_encindex) {
        /* Same encoding: scan the new bytes only when the receiver's
         * coderange is known, i.e. when the result can stay known. */
        if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
            ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
        }
    }
    else {
        str_enc = rb_enc_from_index(str_encindex);
        ptr_enc = rb_enc_from_index(ptr_encindex);
        if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
            /* With an ASCII-incompatible side, only trivial cases work:
             * nothing to append, or an empty receiver that simply takes
             * over the appended bytes' encoding. */
            if (len == 0)
                return str;
            if (RSTRING_LEN(str) == 0) {
                rb_str_buf_cat(str, ptr, len);
                ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
                rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
                return str;
            }
            goto incompatible;
        }
        if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
            ptr_cr = coderange_scan(ptr, len, ptr_enc);
        }
        if (str_cr == ENC_CODERANGE_UNKNOWN) {
            if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
                str_cr = rb_enc_str_coderange(str);
            }
        }
    }
    if (ptr_cr_ret)
        *ptr_cr_ret = ptr_cr;

    /* Different encodings are only compatible when at least one side
     * is 7bit. */
    if (str_encindex != ptr_encindex &&
        str_cr != ENC_CODERANGE_7BIT &&
        ptr_cr != ENC_CODERANGE_7BIT) {
        str_enc = rb_enc_from_index(str_encindex);
        ptr_enc = rb_enc_from_index(ptr_encindex);
        goto incompatible;
    }

    /* Derive the encoding index and coderange of the result. */
    if (str_cr == ENC_CODERANGE_UNKNOWN) {
        res_encindex = str_encindex;
        res_cr = ENC_CODERANGE_UNKNOWN;
    }
    else if (str_cr == ENC_CODERANGE_7BIT) {
        if (ptr_cr == ENC_CODERANGE_7BIT) {
            res_encindex = str_encindex;
            res_cr = ENC_CODERANGE_7BIT;
        }
        else {
            /* A 7bit receiver adopts the appended bytes' encoding. */
            res_encindex = ptr_encindex;
            res_cr = ptr_cr;
        }
    }
    else if (str_cr == ENC_CODERANGE_VALID) {
        res_encindex = str_encindex;
        if (ENC_CODERANGE_CLEAN_P(ptr_cr))
            res_cr = str_cr;
        else
            res_cr = ptr_cr;
    }
    else { /* str_cr == ENC_CODERANGE_BROKEN */
        res_encindex = str_encindex;
        res_cr = str_cr;
        /* Appended bytes might complete a broken sequence: re-scan later. */
        if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
    }

    if (len < 0) {
        rb_raise(rb_eArgError, "negative string size (or size too big)");
    }
    str_buf_cat(str, ptr, len);
    ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
    return str;

  incompatible:
    rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
             rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
}
3732
3733VALUE
3734rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3735{
3736 return rb_enc_cr_str_buf_cat(str, ptr, len,
3737 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3738}
3739
3740VALUE
3741rb_str_buf_cat_ascii(VALUE str, const char *ptr)
3742{
3743 /* ptr must reference NUL terminated ASCII string. */
3744 int encindex = ENCODING_GET(str);
3745 rb_encoding *enc = rb_enc_from_index(encindex);
3746 if (rb_enc_asciicompat(enc)) {
3747 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3748 encindex, ENC_CODERANGE_7BIT, 0);
3749 }
3750 else {
3751 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3752 while (*ptr) {
3753 unsigned int c = (unsigned char)*ptr;
3754 int len = rb_enc_codelen(c, enc);
3755 rb_enc_mbcput(c, buf, enc);
3756 rb_enc_cr_str_buf_cat(str, buf, len,
3757 encindex, ENC_CODERANGE_VALID, 0);
3758 ptr++;
3759 }
3760 return str;
3761 }
3762}
3763
3764VALUE
3766{
3767 int str2_cr = rb_enc_str_coderange(str2);
3768
3769 if (str_enc_fastpath(str)) {
3770 switch (str2_cr) {
3771 case ENC_CODERANGE_7BIT:
3772 // If RHS is 7bit we can do simple concatenation
3773 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3774 RB_GC_GUARD(str2);
3775 return str;
3777 // If RHS is valid, we can do simple concatenation if encodings are the same
3778 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3779 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3780 int str_cr = ENC_CODERANGE(str);
3781 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3782 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3783 }
3784 RB_GC_GUARD(str2);
3785 return str;
3786 }
3787 }
3788 }
3789
3790 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3791 ENCODING_GET(str2), str2_cr, &str2_cr);
3792
3793 ENC_CODERANGE_SET(str2, str2_cr);
3794
3795 return str;
3796}
3797
3798VALUE
3800{
3801 StringValue(str2);
3802 return rb_str_buf_append(str, str2);
3803}
3804
3805VALUE
3806rb_str_concat_literals(size_t num, const VALUE *strary)
3807{
3808 VALUE str;
3809 size_t i, s = 0;
3810 unsigned long len = 1;
3811
3812 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3813 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3814
3815 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3816 str = rb_str_buf_new(len);
3817 str_enc_copy_direct(str, strary[0]);
3818
3819 for (i = s; i < num; ++i) {
3820 const VALUE v = strary[i];
3821 int encidx = ENCODING_GET(v);
3822
3823 rb_str_buf_append(str, v);
3824 if (encidx != ENCINDEX_US_ASCII) {
3825 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3826 rb_enc_set_index(str, encidx);
3827 }
3828 }
3829 return str;
3830}
3831
3832/*
3833 * call-seq:
3834 * concat(*objects) -> string
3835 *
3836 * :include: doc/string/concat.rdoc
3837 */
3838static VALUE
3839rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3840{
3841 str_modifiable(str);
3842
3843 if (argc == 1) {
3844 return rb_str_concat(str, argv[0]);
3845 }
3846 else if (argc > 1) {
3847 int i;
3848 VALUE arg_str = rb_str_tmp_new(0);
3849 rb_enc_copy(arg_str, str);
3850 for (i = 0; i < argc; i++) {
3851 rb_str_concat(arg_str, argv[i]);
3852 }
3853 rb_str_buf_append(str, arg_str);
3854 }
3855
3856 return str;
3857}
3858
3859/*
3860 * call-seq:
3861 * append_as_bytes(*objects) -> self
3862 *
3863 * Concatenates each object in +objects+ into +self+; returns +self+;
3864 * performs no encoding validation or conversion:
3865 *
3866 * s = 'foo'
3867 * s.append_as_bytes(" \xE2\x82") # => "foo \xE2\x82"
3868 * s.valid_encoding? # => false
3869 * s.append_as_bytes("\xAC 12")
3870 * s.valid_encoding? # => true
3871 *
3872 * When a given object is an integer,
3873 * the value is considered an 8-bit byte;
 *  if the integer occupies more than one byte (i.e., is greater than 255),
3875 * appends only the low-order byte (similar to String#setbyte):
3876 *
3877 * s = ""
3878 * s.append_as_bytes(0, 257) # => "\u0000\u0001"
3879 * s.bytesize # => 2
3880 *
3881 * Related: see {Modifying}[rdoc-ref:String@Modifying].
3882 */
3883
3884VALUE
3885rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
3886{
3887 long needed_capacity = 0;
3888 volatile VALUE t0;
3889 enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
3890
3891 for (int index = 0; index < argc; index++) {
3892 VALUE obj = argv[index];
3893 enum ruby_value_type type = types[index] = rb_type(obj);
3894 switch (type) {
3895 case T_FIXNUM:
3896 case T_BIGNUM:
3897 needed_capacity++;
3898 break;
3899 case T_STRING:
3900 needed_capacity += RSTRING_LEN(obj);
3901 break;
3902 default:
3903 rb_raise(
3905 "wrong argument type %"PRIsVALUE" (expected String or Integer)",
3906 rb_obj_class(obj)
3907 );
3908 break;
3909 }
3910 }
3911
3912 str_ensure_available_capa(str, needed_capacity);
3913 char *sptr = RSTRING_END(str);
3914
3915 for (int index = 0; index < argc; index++) {
3916 VALUE obj = argv[index];
3917 enum ruby_value_type type = types[index];
3918 switch (type) {
3919 case T_FIXNUM:
3920 case T_BIGNUM: {
3921 argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
3922 char byte = (char)(NUM2INT(obj) & 0xFF);
3923 *sptr = byte;
3924 sptr++;
3925 break;
3926 }
3927 case T_STRING: {
3928 const char *ptr;
3929 long len;
3930 RSTRING_GETMEM(obj, ptr, len);
3931 memcpy(sptr, ptr, len);
3932 sptr += len;
3933 break;
3934 }
3935 default:
3936 rb_bug("append_as_bytes arguments should have been validated");
3937 }
3938 }
3939
3940 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3941 TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
3942
3943 int cr = ENC_CODERANGE(str);
3944 switch (cr) {
3945 case ENC_CODERANGE_7BIT: {
3946 for (int index = 0; index < argc; index++) {
3947 VALUE obj = argv[index];
3948 enum ruby_value_type type = types[index];
3949 switch (type) {
3950 case T_FIXNUM:
3951 case T_BIGNUM: {
3952 if (!ISASCII(NUM2INT(obj))) {
3953 goto clear_cr;
3954 }
3955 break;
3956 }
3957 case T_STRING: {
3958 if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
3959 goto clear_cr;
3960 }
3961 break;
3962 }
3963 default:
3964 rb_bug("append_as_bytes arguments should have been validated");
3965 }
3966 }
3967 break;
3968 }
3970 if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
3971 goto keep_cr;
3972 }
3973 else {
3974 goto clear_cr;
3975 }
3976 break;
3977 default:
3978 goto clear_cr;
3979 break;
3980 }
3981
3982 RB_GC_GUARD(t0);
3983
3984 clear_cr:
3985 // If no fast path was hit, we clear the coderange.
3986 // append_as_bytes is predominantly meant to be used in
3987 // buffering situation, hence it's likely the coderange
3988 // will never be scanned, so it's not worth spending time
3989 // precomputing the coderange except for simple and common
3990 // situations.
3992 keep_cr:
3993 return str;
3994}
3995
3996/*
3997 * call-seq:
3998 * self << object -> self
3999 *
4000 * Appends a string representation of +object+ to +self+;
4001 * returns +self+.
4002 *
4003 * If +object+ is a string, appends it to +self+:
4004 *
4005 * s = 'foo'
4006 * s << 'bar' # => "foobar"
4007 * s # => "foobar"
4008 *
4009 * If +object+ is an integer,
4010 * its value is considered a codepoint;
4011 * converts the value to a character before concatenating:
4012 *
4013 * s = 'foo'
4014 * s << 33 # => "foo!"
4015 *
4016 * Additionally, if the codepoint is in range <tt>0..0xff</tt>
4017 * and the encoding of +self+ is Encoding::US_ASCII,
4018 * changes the encoding to Encoding::ASCII_8BIT:
4019 *
4020 * s = 'foo'.encode(Encoding::US_ASCII)
4021 * s.encoding # => #<Encoding:US-ASCII>
4022 * s << 0xff # => "foo\xFF"
4023 * s.encoding # => #<Encoding:BINARY (ASCII-8BIT)>
4024 *
4025 * Raises RangeError if that codepoint is not representable in the encoding of +self+:
4026 *
4027 * s = 'foo'
4028 * s.encoding # => <Encoding:UTF-8>
4029 * s << 0x00110000 # 1114112 out of char range (RangeError)
4030 * s = 'foo'.encode(Encoding::EUC_JP)
4031 * s << 0x00800080 # invalid codepoint 0x800080 in EUC-JP (RangeError)
4032 *
4033 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4034 */
4035VALUE
4037{
4038 unsigned int code;
4039 rb_encoding *enc = STR_ENC_GET(str1);
4040 int encidx;
4041
4042 if (RB_INTEGER_TYPE_P(str2)) {
4043 if (rb_num_to_uint(str2, &code) == 0) {
4044 }
4045 else if (FIXNUM_P(str2)) {
4046 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
4047 }
4048 else {
4049 rb_raise(rb_eRangeError, "bignum out of char range");
4050 }
4051 }
4052 else {
4053 return rb_str_append(str1, str2);
4054 }
4055
4056 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
4057
4058 if (encidx >= 0) {
4059 rb_str_buf_cat_byte(str1, (unsigned char)code);
4060 }
4061 else {
4062 long pos = RSTRING_LEN(str1);
4063 int cr = ENC_CODERANGE(str1);
4064 int len;
4065 char *buf;
4066
4067 switch (len = rb_enc_codelen(code, enc)) {
4068 case ONIGERR_INVALID_CODE_POINT_VALUE:
4069 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4070 break;
4071 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
4072 case 0:
4073 rb_raise(rb_eRangeError, "%u out of char range", code);
4074 break;
4075 }
4076 buf = ALLOCA_N(char, len + 1);
4077 rb_enc_mbcput(code, buf, enc);
4078 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
4079 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4080 }
4081 rb_str_resize(str1, pos+len);
4082 memcpy(RSTRING_PTR(str1) + pos, buf, len);
4083 if (cr == ENC_CODERANGE_7BIT && code > 127) {
4085 }
4086 else if (cr == ENC_CODERANGE_BROKEN) {
4088 }
4089 ENC_CODERANGE_SET(str1, cr);
4090 }
4091 return str1;
4092}
4093
4094int
4095rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
4096{
4097 int encidx = rb_enc_to_index(enc);
4098
4099 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4100 /* US-ASCII automatically extended to ASCII-8BIT */
4101 if (code > 0xFF) {
4102 rb_raise(rb_eRangeError, "%u out of char range", code);
4103 }
4104 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4105 return ENCINDEX_ASCII_8BIT;
4106 }
4107 return encidx;
4108 }
4109 else {
4110 return -1;
4111 }
4112}
4113
4114/*
4115 * call-seq:
 *    prepend(*other_strings) -> self
4117 *
4118 * Prefixes to +self+ the concatenation of the given +other_strings+; returns +self+:
4119 *
4120 * 'baz'.prepend('foo', 'bar') # => "foobarbaz"
4121 *
4122 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4123 *
4124 */
4125
4126static VALUE
4127rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
4128{
4129 str_modifiable(str);
4130
4131 if (argc == 1) {
4132 rb_str_update(str, 0L, 0L, argv[0]);
4133 }
4134 else if (argc > 1) {
4135 int i;
4136 VALUE arg_str = rb_str_tmp_new(0);
4137 rb_enc_copy(arg_str, str);
4138 for (i = 0; i < argc; i++) {
4139 rb_str_append(arg_str, argv[i]);
4140 }
4141 rb_str_update(str, 0L, 0L, arg_str);
4142 }
4143
4144 return str;
4145}
4146
4147st_index_t
4149{
4150 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
4151 st_index_t precomputed_hash;
4152 memcpy(&precomputed_hash, RSTRING_END(str) + TERM_LEN(str), sizeof(precomputed_hash));
4153
4154 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4155 return precomputed_hash;
4156 }
4157
4158 return str_do_hash(str);
4159}
4160
4161int
4163{
4164 long len1, len2;
4165 const char *ptr1, *ptr2;
4166 RSTRING_GETMEM(str1, ptr1, len1);
4167 RSTRING_GETMEM(str2, ptr2, len2);
4168 return (len1 != len2 ||
4169 !rb_str_comparable(str1, str2) ||
4170 memcmp(ptr1, ptr2, len1) != 0);
4171}
4172
4173/*
4174 * call-seq:
4175 * hash -> integer
4176 *
4177 * :include: doc/string/hash.rdoc
4178 *
4179 */
4180
4181static VALUE
4182rb_str_hash_m(VALUE str)
4183{
4184 st_index_t hval = rb_str_hash(str);
4185 return ST2FIX(hval);
4186}
4187
/* Smaller of two values; note that the arguments are evaluated more than once. */
#define lesser(a,b) (((b) < (a)) ? (b) : (a))
4189
4190int
4192{
4193 int idx1, idx2;
4194 int rc1, rc2;
4195
4196 if (RSTRING_LEN(str1) == 0) return TRUE;
4197 if (RSTRING_LEN(str2) == 0) return TRUE;
4198 idx1 = ENCODING_GET(str1);
4199 idx2 = ENCODING_GET(str2);
4200 if (idx1 == idx2) return TRUE;
4201 rc1 = rb_enc_str_coderange(str1);
4202 rc2 = rb_enc_str_coderange(str2);
4203 if (rc1 == ENC_CODERANGE_7BIT) {
4204 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
4205 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4206 return TRUE;
4207 }
4208 if (rc2 == ENC_CODERANGE_7BIT) {
4209 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4210 return TRUE;
4211 }
4212 return FALSE;
4213}
4214
4215int
4217{
4218 long len1, len2;
4219 const char *ptr1, *ptr2;
4220 int retval;
4221
4222 if (str1 == str2) return 0;
4223 RSTRING_GETMEM(str1, ptr1, len1);
4224 RSTRING_GETMEM(str2, ptr2, len2);
4225 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4226 if (len1 == len2) {
4227 if (!rb_str_comparable(str1, str2)) {
4228 if (ENCODING_GET(str1) > ENCODING_GET(str2))
4229 return 1;
4230 return -1;
4231 }
4232 return 0;
4233 }
4234 if (len1 > len2) return 1;
4235 return -1;
4236 }
4237 if (retval > 0) return 1;
4238 return -1;
4239}
4240
4241/*
4242 * call-seq:
4243 * self == object -> true or false
4244 *
4245 * Returns whether +object+ is equal to +self+.
4246 *
4247 * When +object+ is a string, returns whether +object+ has the same length and content as +self+:
4248 *
4249 * s = 'foo'
4250 * s == 'foo' # => true
4251 * s == 'food' # => false
4252 * s == 'FOO' # => false
4253 *
4254 * Returns +false+ if the two strings' encodings are not compatible:
4255 *
4256 * "\u{e4 f6 fc}".encode(Encoding::ISO_8859_1) == ("\u{c4 d6 dc}") # => false
4257 *
4258 * When +object+ is not a string:
4259 *
4260 * - If +object+ responds to method <tt>to_str</tt>,
4261 * <tt>object == self</tt> is called and its return value is returned.
4262 * - If +object+ does not respond to <tt>to_str</tt>,
4263 * +false+ is returned.
4264 *
4265 * Related: {Comparing}[rdoc-ref:String@Comparing].
4266 */
4267
4268VALUE
4270{
4271 if (str1 == str2) return Qtrue;
4272 if (!RB_TYPE_P(str2, T_STRING)) {
4273 if (!rb_respond_to(str2, idTo_str)) {
4274 return Qfalse;
4275 }
4276 return rb_equal(str2, str1);
4277 }
4278 return rb_str_eql_internal(str1, str2);
4279}
4280
4281/*
4282 * call-seq:
4283 * eql?(object) -> true or false
4284 *
4285 * :include: doc/string/eql_p.rdoc
4286 *
4287 */
4288
4289VALUE
4290rb_str_eql(VALUE str1, VALUE str2)
4291{
4292 if (str1 == str2) return Qtrue;
4293 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
4294 return rb_str_eql_internal(str1, str2);
4295}
4296
4297/*
4298 * call-seq:
4299 * self <=> other -> -1, 0, 1, or nil
4300 *
4301 * Compares +self+ and +other+,
4302 * evaluating their _contents_, not their _lengths_.
4303 *
4304 * Returns:
4305 *
4306 * - +-1+, if +self+ is smaller.
4307 * - +0+, if the two are equal.
4308 * - +1+, if +self+ is larger.
4309 * - +nil+, if the two are incomparable.
4310 *
4311 * Examples:
4312 *
4313 * 'a' <=> 'b' # => -1
4314 * 'a' <=> 'ab' # => -1
4315 * 'a' <=> 'a' # => 0
4316 * 'b' <=> 'a' # => 1
4317 * 'ab' <=> 'a' # => 1
4318 * 'a' <=> :a # => nil
4319 *
4320 * \Class \String includes module Comparable,
4321 * each of whose methods uses String#<=> for comparison.
4322 *
4323 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4324 */
4325
4326static VALUE
4327rb_str_cmp_m(VALUE str1, VALUE str2)
4328{
4329 int result;
4330 VALUE s = rb_check_string_type(str2);
4331 if (NIL_P(s)) {
4332 return rb_invcmp(str1, str2);
4333 }
4334 result = rb_str_cmp(str1, s);
4335 return INT2FIX(result);
4336}
4337
4338static VALUE str_casecmp(VALUE str1, VALUE str2);
4339static VALUE str_casecmp_p(VALUE str1, VALUE str2);
4340
4341/*
4342 * call-seq:
4343 * casecmp(other_string) -> -1, 0, 1, or nil
4344 *
4345 * Ignoring case, compares +self+ and +other_string+; returns:
4346 *
4347 * - -1 if <tt>self.downcase</tt> is smaller than <tt>other_string.downcase</tt>.
4348 * - 0 if the two are equal.
4349 * - 1 if <tt>self.downcase</tt> is larger than <tt>other_string.downcase</tt>.
4350 * - +nil+ if the two are incomparable.
4351 *
4352 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4353 *
4354 * Examples:
4355 *
4356 * 'foo'.casecmp('goo') # => -1
4357 * 'goo'.casecmp('foo') # => 1
4358 * 'foo'.casecmp('food') # => -1
4359 * 'food'.casecmp('foo') # => 1
4360 * 'FOO'.casecmp('foo') # => 0
4361 * 'foo'.casecmp('FOO') # => 0
4362 * 'foo'.casecmp(1) # => nil
4363 *
4364 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4365 */
4366
4367static VALUE
4368rb_str_casecmp(VALUE str1, VALUE str2)
4369{
4370 VALUE s = rb_check_string_type(str2);
4371 if (NIL_P(s)) {
4372 return Qnil;
4373 }
4374 return str_casecmp(str1, s);
4375}
4376
4377static VALUE
4378str_casecmp(VALUE str1, VALUE str2)
4379{
4380 long len;
4381 rb_encoding *enc;
4382 const char *p1, *p1end, *p2, *p2end;
4383
4384 enc = rb_enc_compatible(str1, str2);
4385 if (!enc) {
4386 return Qnil;
4387 }
4388
4389 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
4390 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
4391 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4392 while (p1 < p1end && p2 < p2end) {
4393 if (*p1 != *p2) {
4394 unsigned int c1 = TOLOWER(*p1 & 0xff);
4395 unsigned int c2 = TOLOWER(*p2 & 0xff);
4396 if (c1 != c2)
4397 return INT2FIX(c1 < c2 ? -1 : 1);
4398 }
4399 p1++;
4400 p2++;
4401 }
4402 }
4403 else {
4404 while (p1 < p1end && p2 < p2end) {
4405 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4406 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4407
4408 if (0 <= c1 && 0 <= c2) {
4409 c1 = TOLOWER(c1);
4410 c2 = TOLOWER(c2);
4411 if (c1 != c2)
4412 return INT2FIX(c1 < c2 ? -1 : 1);
4413 }
4414 else {
4415 int r;
4416 l1 = rb_enc_mbclen(p1, p1end, enc);
4417 l2 = rb_enc_mbclen(p2, p2end, enc);
4418 len = l1 < l2 ? l1 : l2;
4419 r = memcmp(p1, p2, len);
4420 if (r != 0)
4421 return INT2FIX(r < 0 ? -1 : 1);
4422 if (l1 != l2)
4423 return INT2FIX(l1 < l2 ? -1 : 1);
4424 }
4425 p1 += l1;
4426 p2 += l2;
4427 }
4428 }
4429 if (p1 == p1end && p2 == p2end) return INT2FIX(0);
4430 if (p1 == p1end) return INT2FIX(-1);
4431 return INT2FIX(1);
4432}
4433
4434/*
4435 * call-seq:
4436 * casecmp?(other_string) -> true, false, or nil
4437 *
4438 * Returns +true+ if +self+ and +other_string+ are equal after
4439 * Unicode case folding, +false+ if unequal, +nil+ if incomparable.
4440 *
4441 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4442 *
4443 * Examples:
4444 *
4445 * 'foo'.casecmp?('goo') # => false
4446 * 'goo'.casecmp?('foo') # => false
4447 * 'foo'.casecmp?('food') # => false
4448 * 'food'.casecmp?('foo') # => false
4449 * 'FOO'.casecmp?('foo') # => true
4450 * 'foo'.casecmp?('FOO') # => true
4451 * 'foo'.casecmp?(1) # => nil
4452 *
4453 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4454 */
4455
4456static VALUE
4457rb_str_casecmp_p(VALUE str1, VALUE str2)
4458{
4459 VALUE s = rb_check_string_type(str2);
4460 if (NIL_P(s)) {
4461 return Qnil;
4462 }
4463 return str_casecmp_p(str1, s);
4464}
4465
4466static VALUE
4467str_casecmp_p(VALUE str1, VALUE str2)
4468{
4469 rb_encoding *enc;
4470 VALUE folded_str1, folded_str2;
4471 VALUE fold_opt = sym_fold;
4472
4473 enc = rb_enc_compatible(str1, str2);
4474 if (!enc) {
4475 return Qnil;
4476 }
4477
4478 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4479 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4480
4481 return rb_str_eql(folded_str1, folded_str2);
4482}
4483
4484static long
4485strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
4486 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
4487{
4488 const char *search_start = str_ptr;
4489 long pos, search_len = str_len - offset;
4490
4491 for (;;) {
4492 const char *t;
4493 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4494 if (pos < 0) return pos;
4495 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
4496 if (t == search_start + pos) break;
4497 search_len -= t - search_start;
4498 if (search_len <= 0) return -1;
4499 offset += t - search_start;
4500 search_start = t;
4501 }
4502 return pos + offset;
4503}
4504
4505/* found index in byte */
4506#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
4507#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4508
4509static long
4510rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
4511{
4512 const char *str_ptr, *str_ptr_end, *sub_ptr;
4513 long str_len, sub_len;
4514 rb_encoding *enc;
4515
4516 enc = rb_enc_check(str, sub);
4517 if (is_broken_string(sub)) return -1;
4518
4519 str_ptr = RSTRING_PTR(str);
4520 str_ptr_end = RSTRING_END(str);
4521 str_len = RSTRING_LEN(str);
4522 sub_ptr = RSTRING_PTR(sub);
4523 sub_len = RSTRING_LEN(sub);
4524
4525 if (str_len < sub_len) return -1;
4526
4527 if (offset != 0) {
4528 long str_len_char, sub_len_char;
4529 int single_byte = single_byte_optimizable(str);
4530 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4531 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4532 if (offset < 0) {
4533 offset += str_len_char;
4534 if (offset < 0) return -1;
4535 }
4536 if (str_len_char - offset < sub_len_char) return -1;
4537 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4538 str_ptr += offset;
4539 }
4540 if (sub_len == 0) return offset;
4541
4542 /* need proceed one character at a time */
4543 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4544}
4545
4546
4547/*
4548 * call-seq:
4549 * index(pattern, offset = 0) -> integer or nil
4550 *
4551 * :include: doc/string/index.rdoc
4552 *
4553 */
4554
4555static VALUE
4556rb_str_index_m(int argc, VALUE *argv, VALUE str)
4557{
4558 VALUE sub;
4559 VALUE initpos;
4560 rb_encoding *enc = STR_ENC_GET(str);
4561 long pos;
4562
4563 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4564 long slen = str_strlen(str, enc); /* str's enc */
4565 pos = NUM2LONG(initpos);
4566 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4567 if (RB_TYPE_P(sub, T_REGEXP)) {
4569 }
4570 return Qnil;
4571 }
4572 }
4573 else {
4574 pos = 0;
4575 }
4576
4577 if (RB_TYPE_P(sub, T_REGEXP)) {
4578 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4579 enc, single_byte_optimizable(str));
4580
4581 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4582 VALUE match = rb_backref_get();
4583 struct re_registers *regs = RMATCH_REGS(match);
4584 pos = rb_str_sublen(str, BEG(0));
4585 return LONG2NUM(pos);
4586 }
4587 }
4588 else {
4589 StringValue(sub);
4590 pos = rb_str_index(str, sub, pos);
4591 if (pos >= 0) {
4592 pos = rb_str_sublen(str, pos);
4593 return LONG2NUM(pos);
4594 }
4595 }
4596 return Qnil;
4597}
4598
4599/* Ensure that the given pos is a valid character boundary.
4600 * Note that in this function, "character" means a code point
4601 * (Unicode scalar value), not a grapheme cluster.
4602 */
4603static void
4604str_ensure_byte_pos(VALUE str, long pos)
4605{
4606 if (!single_byte_optimizable(str)) {
4607 const char *s = RSTRING_PTR(str);
4608 const char *e = RSTRING_END(str);
4609 const char *p = s + pos;
4610 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4611 rb_raise(rb_eIndexError,
4612 "offset %ld does not land on character boundary", pos);
4613 }
4614 }
4615}
4616
4617/*
4618 * call-seq:
4619 * byteindex(object, offset = 0) -> integer or nil
4620 *
4621 * Returns the 0-based integer index of a substring of +self+
4622 * specified by +object+ (a string or Regexp) and +offset+,
4623 * or +nil+ if there is no such substring;
4624 * the returned index is the count of _bytes_ (not characters).
4625 *
4626 * When +object+ is a string,
4627 * returns the index of the first found substring equal to +object+:
4628 *
4629 * s = 'foo' # => "foo"
4630 * s.size # => 3 # Three 1-byte characters.
4631 * s.bytesize # => 3 # Three bytes.
4632 * s.byteindex('f') # => 0
4633 * s.byteindex('o') # => 1
4634 * s.byteindex('oo') # => 1
4635 * s.byteindex('ooo') # => nil
4636 *
4637 * When +object+ is a Regexp,
4638 * returns the index of the first found substring matching +object+;
4639 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4640 *
4641 * s = 'foo'
4642 * s.byteindex(/f/) # => 0
4643 * $~ # => #<MatchData "f">
4644 * s.byteindex(/o/) # => 1
4645 * s.byteindex(/oo/) # => 1
4646 * s.byteindex(/ooo/) # => nil
4647 * $~ # => nil
4648 *
4649 * \Integer argument +offset+, if given, specifies the 0-based index
4650 * of the byte where searching is to begin.
4651 *
4652 * When +offset+ is non-negative,
4653 * searching begins at byte position +offset+:
4654 *
4655 * s = 'foo'
4656 * s.byteindex('o', 1) # => 1
4657 * s.byteindex('o', 2) # => 2
4658 * s.byteindex('o', 3) # => nil
4659 *
4660 * When +offset+ is negative, counts backward from the end of +self+:
4661 *
4662 * s = 'foo'
4663 * s.byteindex('o', -1) # => 2
4664 * s.byteindex('o', -2) # => 1
4665 * s.byteindex('o', -3) # => 1
4666 * s.byteindex('o', -4) # => nil
4667 *
4668 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4669 *
4670 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4671 * s.size # => 2 # Two 3-byte characters.
4672 * s.bytesize # => 6 # Six bytes.
4673 * s.byteindex("\uFFFF") # => 0
4674 * s.byteindex("\uFFFF", 1) # Raises IndexError
4675 * s.byteindex("\uFFFF", 2) # Raises IndexError
4676 * s.byteindex("\uFFFF", 3) # => 3
4677 * s.byteindex("\uFFFF", 4) # Raises IndexError
4678 * s.byteindex("\uFFFF", 5) # Raises IndexError
4679 * s.byteindex("\uFFFF", 6) # => nil
4680 *
4681 * Related: see {Querying}[rdoc-ref:String@Querying].
4682 */
4683
4684static VALUE
4685rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4686{
4687 VALUE sub;
4688 VALUE initpos;
4689 long pos;
4690
4691 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4692 long slen = RSTRING_LEN(str);
4693 pos = NUM2LONG(initpos);
4694 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4695 if (RB_TYPE_P(sub, T_REGEXP)) {
4697 }
4698 return Qnil;
4699 }
4700 }
4701 else {
4702 pos = 0;
4703 }
4704
4705 str_ensure_byte_pos(str, pos);
4706
4707 if (RB_TYPE_P(sub, T_REGEXP)) {
4708 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4709 VALUE match = rb_backref_get();
4710 struct re_registers *regs = RMATCH_REGS(match);
4711 pos = BEG(0);
4712 return LONG2NUM(pos);
4713 }
4714 }
4715 else {
4716 StringValue(sub);
4717 pos = rb_str_byteindex(str, sub, pos);
4718 if (pos >= 0) return LONG2NUM(pos);
4719 }
4720 return Qnil;
4721}
4722
#ifndef HAVE_MEMRCHR
/*
 * Fallback for platforms without memrchr(3): returns a pointer to the last
 * byte equal to chr within the first search_len bytes of search_str, or a
 * null pointer when there is none.
 *
 * As with the library function, chr is converted to unsigned char before
 * comparing, so a byte passed as a negative char value or with high bits
 * set still matches.
 */
static void*
memrchr(const char *search_str, int chr, long search_len)
{
    const unsigned char wanted = (unsigned char)chr;
    const char *ptr = search_str + search_len;

    /* Walk backwards from one past the end of the searched range. */
    while (ptr > search_str) {
        if ((unsigned char)*(--ptr) == wanted) return (void *)ptr;
    }

    return ((void *)0);
}
#endif
4735
4736static long
4737str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4738{
4739 char *hit, *adjusted;
4740 int c;
4741 long slen, searchlen;
4742 char *sbeg, *e, *t;
4743
4744 sbeg = RSTRING_PTR(str);
4745 slen = RSTRING_LEN(sub);
4746 if (slen == 0) return s - sbeg;
4747 e = RSTRING_END(str);
4748 t = RSTRING_PTR(sub);
4749 c = *t & 0xff;
4750 searchlen = s - sbeg + 1;
4751
4752 if (memcmp(s, t, slen) == 0) {
4753 return s - sbeg;
4754 }
4755
4756 do {
4757 hit = memrchr(sbeg, c, searchlen);
4758 if (!hit) break;
4759 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4760 if (hit != adjusted) {
4761 searchlen = adjusted - sbeg;
4762 continue;
4763 }
4764 if (memcmp(hit, t, slen) == 0)
4765 return hit - sbeg;
4766 searchlen = adjusted - sbeg;
4767 } while (searchlen > 0);
4768
4769 return -1;
4770}
4771
4772/* found index in byte */
4773static long
4774rb_str_rindex(VALUE str, VALUE sub, long pos)
4775{
4776 long len, slen;
4777 char *sbeg, *s;
4778 rb_encoding *enc;
4779 int singlebyte;
4780
4781 enc = rb_enc_check(str, sub);
4782 if (is_broken_string(sub)) return -1;
4783 singlebyte = single_byte_optimizable(str);
4784 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4785 slen = str_strlen(sub, enc); /* rb_enc_check */
4786
4787 /* substring longer than string */
4788 if (len < slen) return -1;
4789 if (len - pos < slen) pos = len - slen;
4790 if (len == 0) return pos;
4791
4792 sbeg = RSTRING_PTR(str);
4793
4794 if (pos == 0) {
4795 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4796 return 0;
4797 else
4798 return -1;
4799 }
4800
4801 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4802 return str_rindex(str, sub, s, enc);
4803}
4804
4805/*
4806 * call-seq:
4807 * rindex(pattern, offset = self.length) -> integer or nil
4808 *
4809 * :include:doc/string/rindex.rdoc
4810 *
4811 */
4812
4813static VALUE
4814rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4815{
4816 VALUE sub;
4817 VALUE initpos;
4818 rb_encoding *enc = STR_ENC_GET(str);
4819 long pos, len = str_strlen(str, enc); /* str's enc */
4820
4821 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4822 pos = NUM2LONG(initpos);
4823 if (pos < 0 && (pos += len) < 0) {
4824 if (RB_TYPE_P(sub, T_REGEXP)) {
4826 }
4827 return Qnil;
4828 }
4829 if (pos > len) pos = len;
4830 }
4831 else {
4832 pos = len;
4833 }
4834
4835 if (RB_TYPE_P(sub, T_REGEXP)) {
4836 /* enc = rb_enc_check(str, sub); */
4837 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4838 enc, single_byte_optimizable(str));
4839
4840 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4841 VALUE match = rb_backref_get();
4842 struct re_registers *regs = RMATCH_REGS(match);
4843 pos = rb_str_sublen(str, BEG(0));
4844 return LONG2NUM(pos);
4845 }
4846 }
4847 else {
4848 StringValue(sub);
4849 pos = rb_str_rindex(str, sub, pos);
4850 if (pos >= 0) {
4851 pos = rb_str_sublen(str, pos);
4852 return LONG2NUM(pos);
4853 }
4854 }
4855 return Qnil;
4856}
4857
4858static long
4859rb_str_byterindex(VALUE str, VALUE sub, long pos)
4860{
4861 long len, slen;
4862 char *sbeg, *s;
4863 rb_encoding *enc;
4864
4865 enc = rb_enc_check(str, sub);
4866 if (is_broken_string(sub)) return -1;
4867 len = RSTRING_LEN(str);
4868 slen = RSTRING_LEN(sub);
4869
4870 /* substring longer than string */
4871 if (len < slen) return -1;
4872 if (len - pos < slen) pos = len - slen;
4873 if (len == 0) return pos;
4874
4875 sbeg = RSTRING_PTR(str);
4876
4877 if (pos == 0) {
4878 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4879 return 0;
4880 else
4881 return -1;
4882 }
4883
4884 s = sbeg + pos;
4885 return str_rindex(str, sub, s, enc);
4886}
4887
4888/*
4889 * call-seq:
4890 * byterindex(object, offset = self.bytesize) -> integer or nil
4891 *
4892 * Returns the 0-based integer index of a substring of +self+
4893 * that is the _last_ match for the given +object+ (a string or Regexp) and +offset+,
4894 * or +nil+ if there is no such substring;
4895 * the returned index is the count of _bytes_ (not characters).
4896 *
4897 * When +object+ is a string,
4898 * returns the index of the _last_ found substring equal to +object+:
4899 *
4900 * s = 'foo' # => "foo"
4901 * s.size # => 3 # Three 1-byte characters.
4902 * s.bytesize # => 3 # Three bytes.
4903 * s.byterindex('f') # => 0
 *   s.byterindex('o')   # => 2
 *   s.byterindex('oo')  # => 1
 *   s.byterindex('ooo') # => nil
4907 *
4908 * When +object+ is a Regexp,
4909 * returns the index of the last found substring matching +object+;
4910 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4911 *
4912 * s = 'foo'
4913 * s.byterindex(/f/) # => 0
4914 * $~ # => #<MatchData "f">
4915 * s.byterindex(/o/) # => 2
4916 * s.byterindex(/oo/) # => 1
4917 * s.byterindex(/ooo/) # => nil
4918 * $~ # => nil
4919 *
4920 * The last match means starting at the possible last position,
4921 * not the last of the longest matches:
4922 *
4923 * s = 'foo'
4924 * s.byterindex(/o+/) # => 2
4925 * $~ #=> #<MatchData "o">
4926 *
4927 * To get the last longest match, use a negative lookbehind:
4928 *
4929 * s = 'foo'
4930 * s.byterindex(/(?<!o)o+/) # => 1
4931 * $~ # => #<MatchData "oo">
4932 *
4933 * Or use method #byteindex with negative lookahead:
4934 *
4935 * s = 'foo'
4936 * s.byteindex(/o+(?!.*o)/) # => 1
4937 * $~ #=> #<MatchData "oo">
4938 *
4939 * \Integer argument +offset+, if given, specifies the 0-based index
4940 * of the byte where searching is to end.
4941 *
4942 * When +offset+ is non-negative,
4943 * searching ends at byte position +offset+:
4944 *
4945 * s = 'foo'
4946 * s.byterindex('o', 0) # => nil
4947 * s.byterindex('o', 1) # => 1
4948 * s.byterindex('o', 2) # => 2
4949 * s.byterindex('o', 3) # => 2
4950 *
4951 * When +offset+ is negative, counts backward from the end of +self+:
4952 *
4953 * s = 'foo'
4954 * s.byterindex('o', -1) # => 2
4955 * s.byterindex('o', -2) # => 1
4956 * s.byterindex('o', -3) # => nil
4957 *
4958 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4959 *
4960 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4961 * s.size # => 2 # Two 3-byte characters.
4962 * s.bytesize # => 6 # Six bytes.
4963 * s.byterindex("\uFFFF") # => 3
4964 * s.byterindex("\uFFFF", 1) # Raises IndexError
4965 * s.byterindex("\uFFFF", 2) # Raises IndexError
4966 * s.byterindex("\uFFFF", 3) # => 3
4967 * s.byterindex("\uFFFF", 4) # Raises IndexError
4968 * s.byterindex("\uFFFF", 5) # Raises IndexError
4969 * s.byterindex("\uFFFF", 6) # => nil
4970 *
4971 * Related: see {Querying}[rdoc-ref:String@Querying].
4972 */
4973
4974static VALUE
4975rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4976{
4977 VALUE sub;
4978 VALUE initpos;
4979 long pos, len = RSTRING_LEN(str);
4980
4981 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4982 pos = NUM2LONG(initpos);
4983 if (pos < 0 && (pos += len) < 0) {
4984 if (RB_TYPE_P(sub, T_REGEXP)) {
4986 }
4987 return Qnil;
4988 }
4989 if (pos > len) pos = len;
4990 }
4991 else {
4992 pos = len;
4993 }
4994
4995 str_ensure_byte_pos(str, pos);
4996
4997 if (RB_TYPE_P(sub, T_REGEXP)) {
4998 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4999 VALUE match = rb_backref_get();
5000 struct re_registers *regs = RMATCH_REGS(match);
5001 pos = BEG(0);
5002 return LONG2NUM(pos);
5003 }
5004 }
5005 else {
5006 StringValue(sub);
5007 pos = rb_str_byterindex(str, sub, pos);
5008 if (pos >= 0) return LONG2NUM(pos);
5009 }
5010 return Qnil;
5011}
5012
5013/*
5014 * call-seq:
5015 * self =~ object -> integer or nil
5016 *
5017 * When +object+ is a Regexp, returns the index of the first substring in +self+
5018 * matched by +object+,
5019 * or +nil+ if no match is found;
5020 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
5021 *
5022 * 'foo' =~ /f/ # => 0
5023 * $~ # => #<MatchData "f">
5024 * 'foo' =~ /o/ # => 1
5025 * $~ # => #<MatchData "o">
5026 * 'foo' =~ /x/ # => nil
5027 * $~ # => nil
5028 *
5029 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
5030 * (see Regexp#=~):
5031 *
5032 * number = nil
5033 * 'no. 9' =~ /(?<number>\d+)/ # => 4
5034 * number # => nil # Not assigned.
5035 * /(?<number>\d+)/ =~ 'no. 9' # => 4
5036 * number # => "9" # Assigned.
5037 *
5038 * If +object+ is not a Regexp, returns the value
5039 * returned by <tt>object =~ self</tt>.
5040 *
5041 * Related: see {Querying}[rdoc-ref:String@Querying].
5042 */
5043
5044static VALUE
5045rb_str_match(VALUE x, VALUE y)
5046{
5047 switch (OBJ_BUILTIN_TYPE(y)) {
5048 case T_STRING:
5049 rb_raise(rb_eTypeError, "type mismatch: String given");
5050
5051 case T_REGEXP:
5052 return rb_reg_match(y, x);
5053
5054 default:
5055 return rb_funcall(y, idEqTilde, 1, x);
5056 }
5057}
5058
5059
5060static VALUE get_pat(VALUE);
5061
5062
5063/*
5064 * call-seq:
5065 * match(pattern, offset = 0) -> matchdata or nil
5066 * match(pattern, offset = 0) {|matchdata| ... } -> object
5067 *
5068 * Creates a MatchData object based on +self+ and the given arguments;
5069 * updates {Regexp Global Variables}[rdoc-ref:Regexp@Global+Variables].
5070 *
5071 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5072 *
5073 * regexp = Regexp.new(pattern)
5074 *
5075 * - Computes +matchdata+, which will be either a MatchData object or +nil+
5076 * (see Regexp#match):
5077 *
5078 * matchdata = regexp.match(self[offset..])
5079 *
5080 * With no block given, returns the computed +matchdata+ or +nil+:
5081 *
5082 * 'foo'.match('f') # => #<MatchData "f">
5083 * 'foo'.match('o') # => #<MatchData "o">
5084 * 'foo'.match('x') # => nil
5085 * 'foo'.match('f', 1) # => nil
5086 * 'foo'.match('o', 1) # => #<MatchData "o">
5087 *
5088 * With a block given and computed +matchdata+ non-nil, calls the block with +matchdata+;
5089 * returns the block's return value:
5090 *
5091 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
5092 *
5093 * With a block given and +nil+ +matchdata+, does not call the block:
5094 *
5095 * 'foo'.match(/x/) {|matchdata| fail 'Cannot happen' } # => nil
5096 *
5097 * Related: see {Querying}[rdoc-ref:String@Querying].
5098 */
5099
5100static VALUE
5101rb_str_match_m(int argc, VALUE *argv, VALUE str)
5102{
5103 VALUE re, result;
5104 if (argc < 1)
5105 rb_check_arity(argc, 1, 2);
5106 re = argv[0];
5107 argv[0] = str;
5108 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
5109 if (!NIL_P(result) && rb_block_given_p()) {
5110 return rb_yield(result);
5111 }
5112 return result;
5113}
5114
5115/*
5116 * call-seq:
5117 * match?(pattern, offset = 0) -> true or false
5118 *
5119 * Returns whether a match is found for +self+ and the given arguments;
5120 * does not update {Regexp Global Variables}[rdoc-ref:Regexp@Global+Variables].
5121 *
5122 * Computes +regexp+ by converting +pattern+ (if not already a Regexp):
5123 *
5124 * regexp = Regexp.new(pattern)
5125 *
5126 * Returns +true+ if <tt>self[offset..].match(regexp)</tt> returns a MatchData object,
5127 * +false+ otherwise:
5128 *
5129 * 'foo'.match?(/o/) # => true
5130 * 'foo'.match?('o') # => true
5131 * 'foo'.match?(/x/) # => false
5132 * 'foo'.match?('f', 1) # => false
5133 * 'foo'.match?('o', 1) # => true
5134 *
5135 * Related: see {Querying}[rdoc-ref:String@Querying].
5136 */
5137
5138static VALUE
5139rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
5140{
5141 VALUE re;
5142 rb_check_arity(argc, 1, 2);
5143 re = get_pat(argv[0]);
5144 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
5145}
5146
/* Outcome of stepping a single character to its successor/predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0,  /* no usable character (e.g. invalid bytes) */
    NEIGHBOR_FOUND = 1,     /* the neighbor was written in place */
    NEIGHBOR_WRAPPED = 2    /* stepping ran off the end of the range */
};
5152
5153static enum neighbor_char
5154enc_succ_char(char *p, long len, rb_encoding *enc)
5155{
5156 long i;
5157 int l;
5158
5159 if (rb_enc_mbminlen(enc) > 1) {
5160 /* wchar, trivial case */
5161 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5162 if (!MBCLEN_CHARFOUND_P(r)) {
5163 return NEIGHBOR_NOT_CHAR;
5164 }
5165 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
5166 l = rb_enc_code_to_mbclen(c, enc);
5167 if (!l) return NEIGHBOR_NOT_CHAR;
5168 if (l != len) return NEIGHBOR_WRAPPED;
5169 rb_enc_mbcput(c, p, enc);
5170 r = rb_enc_precise_mbclen(p, p + len, enc);
5171 if (!MBCLEN_CHARFOUND_P(r)) {
5172 return NEIGHBOR_NOT_CHAR;
5173 }
5174 return NEIGHBOR_FOUND;
5175 }
5176 while (1) {
5177 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
5178 p[i] = '\0';
5179 if (i < 0)
5180 return NEIGHBOR_WRAPPED;
5181 ++((unsigned char*)p)[i];
5182 l = rb_enc_precise_mbclen(p, p+len, enc);
5183 if (MBCLEN_CHARFOUND_P(l)) {
5184 l = MBCLEN_CHARFOUND_LEN(l);
5185 if (l == len) {
5186 return NEIGHBOR_FOUND;
5187 }
5188 else {
5189 memset(p+l, 0xff, len-l);
5190 }
5191 }
5192 if (MBCLEN_INVALID_P(l) && i < len-1) {
5193 long len2;
5194 int l2;
5195 for (len2 = len-1; 0 < len2; len2--) {
5196 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5197 if (!MBCLEN_INVALID_P(l2))
5198 break;
5199 }
5200 memset(p+len2+1, 0xff, len-(len2+1));
5201 }
5202 }
5203}
5204
5205static enum neighbor_char
5206enc_pred_char(char *p, long len, rb_encoding *enc)
5207{
5208 long i;
5209 int l;
5210 if (rb_enc_mbminlen(enc) > 1) {
5211 /* wchar, trivial case */
5212 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5213 if (!MBCLEN_CHARFOUND_P(r)) {
5214 return NEIGHBOR_NOT_CHAR;
5215 }
5216 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
5217 if (!c) return NEIGHBOR_NOT_CHAR;
5218 --c;
5219 l = rb_enc_code_to_mbclen(c, enc);
5220 if (!l) return NEIGHBOR_NOT_CHAR;
5221 if (l != len) return NEIGHBOR_WRAPPED;
5222 rb_enc_mbcput(c, p, enc);
5223 r = rb_enc_precise_mbclen(p, p + len, enc);
5224 if (!MBCLEN_CHARFOUND_P(r)) {
5225 return NEIGHBOR_NOT_CHAR;
5226 }
5227 return NEIGHBOR_FOUND;
5228 }
5229 while (1) {
5230 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
5231 p[i] = '\xff';
5232 if (i < 0)
5233 return NEIGHBOR_WRAPPED;
5234 --((unsigned char*)p)[i];
5235 l = rb_enc_precise_mbclen(p, p+len, enc);
5236 if (MBCLEN_CHARFOUND_P(l)) {
5237 l = MBCLEN_CHARFOUND_LEN(l);
5238 if (l == len) {
5239 return NEIGHBOR_FOUND;
5240 }
5241 else {
5242 memset(p+l, 0, len-l);
5243 }
5244 }
5245 if (MBCLEN_INVALID_P(l) && i < len-1) {
5246 long len2;
5247 int l2;
5248 for (len2 = len-1; 0 < len2; len2--) {
5249 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5250 if (!MBCLEN_INVALID_P(l2))
5251 break;
5252 }
5253 memset(p+len2+1, 0, len-(len2+1));
5254 }
5255 }
5256}
5257
5258/*
5259 overwrite +p+ by succeeding letter in +enc+ and returns
5260 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
5261 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
5262 assuming each ranges are successive, and mbclen
5263 never change in each ranges.
5264 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
5265 character.
5266 */
5267static enum neighbor_char
5268enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
5269{
5270 enum neighbor_char ret;
5271 unsigned int c;
5272 int ctype;
5273 int range;
5274 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5275
5276 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
5277 int try;
5278 const int max_gaps = 1;
5279
5280 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5281 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
5282 ctype = ONIGENC_CTYPE_DIGIT;
5283 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
5284 ctype = ONIGENC_CTYPE_ALPHA;
5285 else
5286 return NEIGHBOR_NOT_CHAR;
5287
5288 MEMCPY(save, p, char, len);
5289 for (try = 0; try <= max_gaps; ++try) {
5290 ret = enc_succ_char(p, len, enc);
5291 if (ret == NEIGHBOR_FOUND) {
5292 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5293 if (rb_enc_isctype(c, ctype, enc))
5294 return NEIGHBOR_FOUND;
5295 }
5296 }
5297 MEMCPY(p, save, char, len);
5298 range = 1;
5299 while (1) {
5300 MEMCPY(save, p, char, len);
5301 ret = enc_pred_char(p, len, enc);
5302 if (ret == NEIGHBOR_FOUND) {
5303 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5304 if (!rb_enc_isctype(c, ctype, enc)) {
5305 MEMCPY(p, save, char, len);
5306 break;
5307 }
5308 }
5309 else {
5310 MEMCPY(p, save, char, len);
5311 break;
5312 }
5313 range++;
5314 }
5315 if (range == 1) {
5316 return NEIGHBOR_NOT_CHAR;
5317 }
5318
5319 if (ctype != ONIGENC_CTYPE_DIGIT) {
5320 MEMCPY(carry, p, char, len);
5321 return NEIGHBOR_WRAPPED;
5322 }
5323
5324 MEMCPY(carry, p, char, len);
5325 enc_succ_char(carry, len, enc);
5326 return NEIGHBOR_WRAPPED;
5327}
5328
5329
5330static VALUE str_succ(VALUE str);
5331
5332/*
5333 * call-seq:
5334 * succ -> new_str
5335 *
5336 * :include: doc/string/succ.rdoc
5337 *
5338 */
5339
5340VALUE
5342{
5343 VALUE str;
5344 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5345 rb_enc_cr_str_copy_for_substr(str, orig);
5346 return str_succ(str);
5347}
5348
5349static VALUE
5350str_succ(VALUE str)
5351{
5352 rb_encoding *enc;
5353 char *sbeg, *s, *e, *last_alnum = 0;
5354 int found_alnum = 0;
5355 long l, slen;
5356 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
5357 long carry_pos = 0, carry_len = 1;
5358 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5359
5360 slen = RSTRING_LEN(str);
5361 if (slen == 0) return str;
5362
5363 enc = STR_ENC_GET(str);
5364 sbeg = RSTRING_PTR(str);
5365 s = e = sbeg + slen;
5366
5367 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5368 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5369 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
5370 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
5371 break;
5372 }
5373 }
5374 l = rb_enc_precise_mbclen(s, e, enc);
5375 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5376 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5377 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5378 switch (neighbor) {
5379 case NEIGHBOR_NOT_CHAR:
5380 continue;
5381 case NEIGHBOR_FOUND:
5382 return str;
5383 case NEIGHBOR_WRAPPED:
5384 last_alnum = s;
5385 break;
5386 }
5387 found_alnum = 1;
5388 carry_pos = s - sbeg;
5389 carry_len = l;
5390 }
5391 if (!found_alnum) { /* str contains no alnum */
5392 s = e;
5393 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5394 enum neighbor_char neighbor;
5395 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5396 l = rb_enc_precise_mbclen(s, e, enc);
5397 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5398 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5399 MEMCPY(tmp, s, char, l);
5400 neighbor = enc_succ_char(tmp, l, enc);
5401 switch (neighbor) {
5402 case NEIGHBOR_FOUND:
5403 MEMCPY(s, tmp, char, l);
5404 return str;
5405 break;
5406 case NEIGHBOR_WRAPPED:
5407 MEMCPY(s, tmp, char, l);
5408 break;
5409 case NEIGHBOR_NOT_CHAR:
5410 break;
5411 }
5412 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5413 /* wrapped to \0...\0. search next valid char. */
5414 enc_succ_char(s, l, enc);
5415 }
5416 if (!rb_enc_asciicompat(enc)) {
5417 MEMCPY(carry, s, char, l);
5418 carry_len = l;
5419 }
5420 carry_pos = s - sbeg;
5421 }
5423 }
5424 RESIZE_CAPA(str, slen + carry_len);
5425 sbeg = RSTRING_PTR(str);
5426 s = sbeg + carry_pos;
5427 memmove(s + carry_len, s, slen - carry_pos);
5428 memmove(s, carry, carry_len);
5429 slen += carry_len;
5430 STR_SET_LEN(str, slen);
5431 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5433 return str;
5434}
5435
5436
5437/*
5438 * call-seq:
5439 * succ! -> self
5440 *
5441 * Like String#succ, but modifies +self+ in place; returns +self+.
5442 *
5443 * Related: see {Modifying}[rdoc-ref:String@Modifying].
5444 */
5445
5446static VALUE
5447rb_str_succ_bang(VALUE str)
5448{
5449 rb_str_modify(str);
5450 str_succ(str);
5451 return str;
5452}
5453
/*
 * Returns 1 when each of the first +len+ bytes of +s+ satisfies
 * ISDIGIT (trivially so when +len+ is not positive), 0 otherwise.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) {
            return 0;
        }
    }
    return 1;
}
5463
5464static int
5465str_upto_i(VALUE str, VALUE arg)
5466{
5467 rb_yield(str);
5468 return 0;
5469}
5470
5471/*
5472 * call-seq:
5473 * upto(other_string, exclusive = false) {|string| ... } -> self
5474 * upto(other_string, exclusive = false) -> new_enumerator
5475 *
5476 * :include: doc/string/upto.rdoc
5477 *
5478 */
5479
5480static VALUE
5481rb_str_upto(int argc, VALUE *argv, VALUE beg)
5482{
5483 VALUE end, exclusive;
5484
5485 rb_scan_args(argc, argv, "11", &end, &exclusive);
5486 RETURN_ENUMERATOR(beg, argc, argv);
5487 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5488}
5489
5490VALUE
5491rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5492{
5493 VALUE current, after_end;
5494 ID succ;
5495 int n, ascii;
5496 rb_encoding *enc;
5497
5498 CONST_ID(succ, "succ");
5499 StringValue(end);
5500 enc = rb_enc_check(beg, end);
5501 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5502 /* single character */
5503 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5504 char c = RSTRING_PTR(beg)[0];
5505 char e = RSTRING_PTR(end)[0];
5506
5507 if (c > e || (excl && c == e)) return beg;
5508 for (;;) {
5509 VALUE str = rb_enc_str_new(&c, 1, enc);
5511 if ((*each)(str, arg)) break;
5512 if (!excl && c == e) break;
5513 c++;
5514 if (excl && c == e) break;
5515 }
5516 return beg;
5517 }
5518 /* both edges are all digits */
5519 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5520 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5521 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5522 VALUE b, e;
5523 int width;
5524
5525 width = RSTRING_LENINT(beg);
5526 b = rb_str_to_inum(beg, 10, FALSE);
5527 e = rb_str_to_inum(end, 10, FALSE);
5528 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5529 long bi = FIX2LONG(b);
5530 long ei = FIX2LONG(e);
5531 rb_encoding *usascii = rb_usascii_encoding();
5532
5533 while (bi <= ei) {
5534 if (excl && bi == ei) break;
5535 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5536 bi++;
5537 }
5538 }
5539 else {
5540 ID op = excl ? '<' : idLE;
5541 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5542
5543 args[0] = INT2FIX(width);
5544 while (rb_funcall(b, op, 1, e)) {
5545 args[1] = b;
5546 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5547 b = rb_funcallv(b, succ, 0, 0);
5548 }
5549 }
5550 return beg;
5551 }
5552 /* normal case */
5553 n = rb_str_cmp(beg, end);
5554 if (n > 0 || (excl && n == 0)) return beg;
5555
5556 after_end = rb_funcallv(end, succ, 0, 0);
5557 current = str_duplicate(rb_cString, beg);
5558 while (!rb_str_equal(current, after_end)) {
5559 VALUE next = Qnil;
5560 if (excl || !rb_str_equal(current, end))
5561 next = rb_funcallv(current, succ, 0, 0);
5562 if ((*each)(current, arg)) break;
5563 if (NIL_P(next)) break;
5564 current = next;
5565 StringValue(current);
5566 if (excl && rb_str_equal(current, end)) break;
5567 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5568 break;
5569 }
5570
5571 return beg;
5572}
5573
5574VALUE
5575rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5576{
5577 VALUE current;
5578 ID succ;
5579
5580 CONST_ID(succ, "succ");
5581 /* both edges are all digits */
5582 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5583 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5584 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5585 int width = RSTRING_LENINT(beg);
5586 b = rb_str_to_inum(beg, 10, FALSE);
5587 if (FIXNUM_P(b)) {
5588 long bi = FIX2LONG(b);
5589 rb_encoding *usascii = rb_usascii_encoding();
5590
5591 while (FIXABLE(bi)) {
5592 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5593 bi++;
5594 }
5595 b = LONG2NUM(bi);
5596 }
5597 args[0] = INT2FIX(width);
5598 while (1) {
5599 args[1] = b;
5600 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5601 b = rb_funcallv(b, succ, 0, 0);
5602 }
5603 }
5604 /* normal case */
5605 current = str_duplicate(rb_cString, beg);
5606 while (1) {
5607 VALUE next = rb_funcallv(current, succ, 0, 0);
5608 if ((*each)(current, arg)) break;
5609 current = next;
5610 StringValue(current);
5611 if (RSTRING_LEN(current) == 0)
5612 break;
5613 }
5614
5615 return beg;
5616}
5617
5618static int
5619include_range_i(VALUE str, VALUE arg)
5620{
5621 VALUE *argp = (VALUE *)arg;
5622 if (!rb_equal(str, *argp)) return 0;
5623 *argp = Qnil;
5624 return 1;
5625}
5626
5627VALUE
5628rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5629{
5630 beg = rb_str_new_frozen(beg);
5631 StringValue(end);
5632 end = rb_str_new_frozen(end);
5633 if (NIL_P(val)) return Qfalse;
5634 val = rb_check_string_type(val);
5635 if (NIL_P(val)) return Qfalse;
5636 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5637 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5638 rb_enc_asciicompat(STR_ENC_GET(val))) {
5639 const char *bp = RSTRING_PTR(beg);
5640 const char *ep = RSTRING_PTR(end);
5641 const char *vp = RSTRING_PTR(val);
5642 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5643 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5644 return Qfalse;
5645 else {
5646 char b = *bp;
5647 char e = *ep;
5648 char v = *vp;
5649
5650 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5651 if (b <= v && v < e) return Qtrue;
5652 return RBOOL(!RTEST(exclusive) && v == e);
5653 }
5654 }
5655 }
5656#if 0
5657 /* both edges are all digits */
5658 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5659 all_digits_p(bp, RSTRING_LEN(beg)) &&
5660 all_digits_p(ep, RSTRING_LEN(end))) {
5661 /* TODO */
5662 }
5663#endif
5664 }
5665 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5666
5667 return RBOOL(NIL_P(val));
5668}
5669
5670static VALUE
5671rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5672{
5673 if (rb_reg_search(re, str, 0, 0) >= 0) {
5674 VALUE match = rb_backref_get();
5675 int nth = rb_reg_backref_number(match, backref);
5676 return rb_reg_nth_match(nth, match);
5677 }
5678 return Qnil;
5679}
5680
5681static VALUE
5682rb_str_aref(VALUE str, VALUE indx)
5683{
5684 long idx;
5685
5686 if (FIXNUM_P(indx)) {
5687 idx = FIX2LONG(indx);
5688 }
5689 else if (RB_TYPE_P(indx, T_REGEXP)) {
5690 return rb_str_subpat(str, indx, INT2FIX(0));
5691 }
5692 else if (RB_TYPE_P(indx, T_STRING)) {
5693 if (rb_str_index(str, indx, 0) != -1)
5694 return str_duplicate(rb_cString, indx);
5695 return Qnil;
5696 }
5697 else {
5698 /* check if indx is Range */
5699 long beg, len = str_strlen(str, NULL);
5700 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5701 case Qfalse:
5702 break;
5703 case Qnil:
5704 return Qnil;
5705 default:
5706 return rb_str_substr(str, beg, len);
5707 }
5708 idx = NUM2LONG(indx);
5709 }
5710
5711 return str_substr(str, idx, 1, FALSE);
5712}
5713
5714
5715/*
5716 * call-seq:
5717 * self[index] -> new_string or nil
5718 * self[start, length] -> new_string or nil
5719 * self[range] -> new_string or nil
5720 * self[regexp, capture = 0] -> new_string or nil
5721 * self[substring] -> new_string or nil
5722 *
5723 * :include: doc/string/aref.rdoc
5724 *
5725 */
5726
5727static VALUE
5728rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5729{
5730 if (argc == 2) {
5731 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5732 return rb_str_subpat(str, argv[0], argv[1]);
5733 }
5734 else {
5735 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5736 }
5737 }
5738 rb_check_arity(argc, 1, 2);
5739 return rb_str_aref(str, argv[0]);
5740}
5741
5742VALUE
5744{
5745 char *ptr = RSTRING_PTR(str);
5746 long olen = RSTRING_LEN(str), nlen;
5747
5748 str_modifiable(str);
5749 if (len > olen) len = olen;
5750 nlen = olen - len;
5751 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5752 char *oldptr = ptr;
5753 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5754 STR_SET_EMBED(str);
5755 ptr = RSTRING(str)->as.embed.ary;
5756 memmove(ptr, oldptr + len, nlen);
5757 if (fl == STR_NOEMBED) xfree(oldptr);
5758 }
5759 else {
5760 if (!STR_SHARED_P(str)) {
5761 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5762 rb_enc_cr_str_exact_copy(shared, str);
5763 OBJ_FREEZE(shared);
5764 }
5765 ptr = RSTRING(str)->as.heap.ptr += len;
5766 }
5767 STR_SET_LEN(str, nlen);
5768
5769 if (!SHARABLE_MIDDLE_SUBSTRING) {
5770 TERM_FILL(ptr + nlen, TERM_LEN(str));
5771 }
5773 return str;
5774}
5775
5776static void
5777rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5778{
5779 char *sptr;
5780 long slen;
5781 int cr;
5782
5783 if (beg == 0 && vlen == 0) {
5784 rb_str_drop_bytes(str, len);
5785 return;
5786 }
5787
5788 str_modify_keep_cr(str);
5789 RSTRING_GETMEM(str, sptr, slen);
5790 if (len < vlen) {
5791 /* expand string */
5792 RESIZE_CAPA(str, slen + vlen - len);
5793 sptr = RSTRING_PTR(str);
5794 }
5795
5797 cr = rb_enc_str_coderange(val);
5798 else
5800
5801 if (vlen != len) {
5802 memmove(sptr + beg + vlen,
5803 sptr + beg + len,
5804 slen - (beg + len));
5805 }
5806 if (vlen < beg && len < 0) {
5807 MEMZERO(sptr + slen, char, -len);
5808 }
5809 if (vlen > 0) {
5810 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5811 }
5812 slen += vlen - len;
5813 STR_SET_LEN(str, slen);
5814 TERM_FILL(&sptr[slen], TERM_LEN(str));
5815 ENC_CODERANGE_SET(str, cr);
5816}
5817
5818static inline void
5819rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5820{
5821 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5822}
5823
5824void
5825rb_str_update(VALUE str, long beg, long len, VALUE val)
5826{
5827 long slen;
5828 char *p, *e;
5829 rb_encoding *enc;
5830 int singlebyte = single_byte_optimizable(str);
5831 int cr;
5832
5833 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5834
5835 StringValue(val);
5836 enc = rb_enc_check(str, val);
5837 slen = str_strlen(str, enc); /* rb_enc_check */
5838
5839 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5840 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5841 }
5842 if (beg < 0) {
5843 beg += slen;
5844 }
5845 RUBY_ASSERT(beg >= 0);
5846 RUBY_ASSERT(beg <= slen);
5847
5848 if (len > slen - beg) {
5849 len = slen - beg;
5850 }
5851 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5852 if (!p) p = RSTRING_END(str);
5853 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5854 if (!e) e = RSTRING_END(str);
5855 /* error check */
5856 beg = p - RSTRING_PTR(str); /* physical position */
5857 len = e - p; /* physical length */
5858 rb_str_update_0(str, beg, len, val);
5859 rb_enc_associate(str, enc);
5861 if (cr != ENC_CODERANGE_BROKEN)
5862 ENC_CODERANGE_SET(str, cr);
5863}
5864
5865static void
5866rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5867{
5868 int nth;
5869 VALUE match;
5870 long start, end, len;
5871 rb_encoding *enc;
5872 struct re_registers *regs;
5873
5874 if (rb_reg_search(re, str, 0, 0) < 0) {
5875 rb_raise(rb_eIndexError, "regexp not matched");
5876 }
5877 match = rb_backref_get();
5878 nth = rb_reg_backref_number(match, backref);
5879 regs = RMATCH_REGS(match);
5880 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5881 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5882 }
5883 if (nth < 0) {
5884 nth += regs->num_regs;
5885 }
5886
5887 start = BEG(nth);
5888 if (start == -1) {
5889 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5890 }
5891 end = END(nth);
5892 len = end - start;
5893 StringValue(val);
5894 enc = rb_enc_check_str(str, val);
5895 rb_str_update_0(str, start, len, val);
5896 rb_enc_associate(str, enc);
5897}
5898
5899static VALUE
5900rb_str_aset(VALUE str, VALUE indx, VALUE val)
5901{
5902 long idx, beg;
5903
5904 switch (TYPE(indx)) {
5905 case T_REGEXP:
5906 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5907 return val;
5908
5909 case T_STRING:
5910 beg = rb_str_index(str, indx, 0);
5911 if (beg < 0) {
5912 rb_raise(rb_eIndexError, "string not matched");
5913 }
5914 beg = rb_str_sublen(str, beg);
5915 rb_str_update(str, beg, str_strlen(indx, NULL), val);
5916 return val;
5917
5918 default:
5919 /* check if indx is Range */
5920 {
5921 long beg, len;
5922 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5923 rb_str_update(str, beg, len, val);
5924 return val;
5925 }
5926 }
5927 /* FALLTHROUGH */
5928
5929 case T_FIXNUM:
5930 idx = NUM2LONG(indx);
5931 rb_str_update(str, idx, 1, val);
5932 return val;
5933 }
5934}
5935
5936/*
5937 * call-seq:
5938 * self[index] = other_string -> new_string
5939 * self[start, length] = other_string -> new_string
5940 * self[range] = other_string -> new_string
5941 * self[regexp, capture = 0] = other_string -> new_string
5942 * self[substring] = other_string -> new_string
5943 *
5944 * :include: doc/string/aset.rdoc
5945 *
5946 */
5947
5948static VALUE
5949rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5950{
5951 if (argc == 3) {
5952 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5953 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5954 }
5955 else {
5956 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5957 }
5958 return argv[2];
5959 }
5960 rb_check_arity(argc, 2, 3);
5961 return rb_str_aset(str, argv[0], argv[1]);
5962}
5963
5964/*
5965 * call-seq:
5966 * insert(offset, other_string) -> self
5967 *
5968 * :include: doc/string/insert.rdoc
5969 *
5970 */
5971
5972static VALUE
5973rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5974{
5975 long pos = NUM2LONG(idx);
5976
5977 if (pos == -1) {
5978 return rb_str_append(str, str2);
5979 }
5980 else if (pos < 0) {
5981 pos++;
5982 }
5983 rb_str_update(str, pos, 0, str2);
5984 return str;
5985}
5986
5987
5988/*
5989 * call-seq:
5990 * slice!(index) -> new_string or nil
5991 * slice!(start, length) -> new_string or nil
5992 * slice!(range) -> new_string or nil
5993 * slice!(regexp, capture = 0) -> new_string or nil
5994 * slice!(substring) -> new_string or nil
5995 *
5996 * Like String#[] (and its alias String#slice), except that:
5997 *
5998 * - Performs substitutions in +self+ (not in a copy of +self+).
5999 * - Returns the removed substring if any modifications were made, +nil+ otherwise.
6000 *
6001 * A few examples:
6002 *
6003 * s = 'hello'
6004 * s.slice!('e') # => "e"
6005 * s # => "hllo"
6006 * s.slice!('e') # => nil
6007 * s # => "hllo"
6008 *
6009 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6010 */
6011
6012static VALUE
6013rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
6014{
6015 VALUE result = Qnil;
6016 VALUE indx;
6017 long beg, len = 1;
6018 char *p;
6019
6020 rb_check_arity(argc, 1, 2);
6021 str_modify_keep_cr(str);
6022 indx = argv[0];
6023 if (RB_TYPE_P(indx, T_REGEXP)) {
6024 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
6025 VALUE match = rb_backref_get();
6026 struct re_registers *regs = RMATCH_REGS(match);
6027 int nth = 0;
6028 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
6029 if ((nth += regs->num_regs) <= 0) return Qnil;
6030 }
6031 else if (nth >= regs->num_regs) return Qnil;
6032 beg = BEG(nth);
6033 len = END(nth) - beg;
6034 goto subseq;
6035 }
6036 else if (argc == 2) {
6037 beg = NUM2LONG(indx);
6038 len = NUM2LONG(argv[1]);
6039 goto num_index;
6040 }
6041 else if (FIXNUM_P(indx)) {
6042 beg = FIX2LONG(indx);
6043 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6044 if (!len) return Qnil;
6045 beg = p - RSTRING_PTR(str);
6046 goto subseq;
6047 }
6048 else if (RB_TYPE_P(indx, T_STRING)) {
6049 beg = rb_str_index(str, indx, 0);
6050 if (beg == -1) return Qnil;
6051 len = RSTRING_LEN(indx);
6052 result = str_duplicate(rb_cString, indx);
6053 goto squash;
6054 }
6055 else {
6056 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
6057 case Qnil:
6058 return Qnil;
6059 case Qfalse:
6060 beg = NUM2LONG(indx);
6061 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6062 if (!len) return Qnil;
6063 beg = p - RSTRING_PTR(str);
6064 goto subseq;
6065 default:
6066 goto num_index;
6067 }
6068 }
6069
6070 num_index:
6071 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6072 beg = p - RSTRING_PTR(str);
6073
6074 subseq:
6075 result = rb_str_new(RSTRING_PTR(str)+beg, len);
6076 rb_enc_cr_str_copy_for_substr(result, str);
6077
6078 squash:
6079 if (len > 0) {
6080 if (beg == 0) {
6081 rb_str_drop_bytes(str, len);
6082 }
6083 else {
6084 char *sptr = RSTRING_PTR(str);
6085 long slen = RSTRING_LEN(str);
6086 if (beg + len > slen) /* pathological check */
6087 len = slen - beg;
6088 memmove(sptr + beg,
6089 sptr + beg + len,
6090 slen - (beg + len));
6091 slen -= len;
6092 STR_SET_LEN(str, slen);
6093 TERM_FILL(&sptr[slen], TERM_LEN(str));
6094 }
6095 }
6096 return result;
6097}
6098
6099static VALUE
6100get_pat(VALUE pat)
6101{
6102 VALUE val;
6103
6104 switch (OBJ_BUILTIN_TYPE(pat)) {
6105 case T_REGEXP:
6106 return pat;
6107
6108 case T_STRING:
6109 break;
6110
6111 default:
6112 val = rb_check_string_type(pat);
6113 if (NIL_P(val)) {
6114 Check_Type(pat, T_REGEXP);
6115 }
6116 pat = val;
6117 }
6118
6119 return rb_reg_regcomp(pat);
6120}
6121
6122static VALUE
6123get_pat_quoted(VALUE pat, int check)
6124{
6125 VALUE val;
6126
6127 switch (OBJ_BUILTIN_TYPE(pat)) {
6128 case T_REGEXP:
6129 return pat;
6130
6131 case T_STRING:
6132 break;
6133
6134 default:
6135 val = rb_check_string_type(pat);
6136 if (NIL_P(val)) {
6137 Check_Type(pat, T_REGEXP);
6138 }
6139 pat = val;
6140 }
6141 if (check && is_broken_string(pat)) {
6142 rb_exc_raise(rb_reg_check_preprocess(pat));
6143 }
6144 return pat;
6145}
6146
6147static long
6148rb_pat_search0(VALUE pat, VALUE str, long pos, int set_backref_str, VALUE *match)
6149{
6150 if (BUILTIN_TYPE(pat) == T_STRING) {
6151 pos = rb_str_byteindex(str, pat, pos);
6152 if (set_backref_str) {
6153 if (pos >= 0) {
6154 str = rb_str_new_frozen_String(str);
6155 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6156 if (match) {
6157 *match = match_data;
6158 }
6159 }
6160 else {
6162 }
6163 }
6164 return pos;
6165 }
6166 else {
6167 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
6168 }
6169}
6170
6171static long
6172rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
6173{
6174 return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
6175}
6176
6177
6178/*
6179 * call-seq:
6180 * sub!(pattern, replacement) -> self or nil
6181 * sub!(pattern) {|match| ... } -> self or nil
6182 *
6183 * Like String#sub, except that:
6184 *
6185 * - Changes are made to +self+, not to a copy of +self+.
6186 * - Returns +self+ if any changes are made, +nil+ otherwise.
6187 *
6188 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6189 */
6190
6191static VALUE
6192rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
6193{
6194 VALUE pat, repl, hash = Qnil;
6195 int iter = 0;
6196 long plen;
6197 int min_arity = rb_block_given_p() ? 1 : 2;
6198 long beg;
6199
6200 rb_check_arity(argc, min_arity, 2);
6201 if (argc == 1) {
6202 iter = 1;
6203 }
6204 else {
6205 repl = argv[1];
6206 hash = rb_check_hash_type(argv[1]);
6207 if (NIL_P(hash)) {
6208 StringValue(repl);
6209 }
6210 }
6211
6212 pat = get_pat_quoted(argv[0], 1);
6213
6214 str_modifiable(str);
6215 beg = rb_pat_search(pat, str, 0, 1);
6216 if (beg >= 0) {
6217 rb_encoding *enc;
6218 int cr = ENC_CODERANGE(str);
6219 long beg0, end0;
6220 VALUE match, match0 = Qnil;
6221 struct re_registers *regs;
6222 char *p, *rp;
6223 long len, rlen;
6224
6225 match = rb_backref_get();
6226 regs = RMATCH_REGS(match);
6227 if (RB_TYPE_P(pat, T_STRING)) {
6228 beg0 = beg;
6229 end0 = beg0 + RSTRING_LEN(pat);
6230 match0 = pat;
6231 }
6232 else {
6233 beg0 = BEG(0);
6234 end0 = END(0);
6235 if (iter) match0 = rb_reg_nth_match(0, match);
6236 }
6237
6238 if (iter || !NIL_P(hash)) {
6239 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6240
6241 if (iter) {
6242 repl = rb_obj_as_string(rb_yield(match0));
6243 }
6244 else {
6245 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6246 repl = rb_obj_as_string(repl);
6247 }
6248 str_mod_check(str, p, len);
6249 rb_check_frozen(str);
6250 }
6251 else {
6252 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6253 }
6254
6255 enc = rb_enc_compatible(str, repl);
6256 if (!enc) {
6257 rb_encoding *str_enc = STR_ENC_GET(str);
6258 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6259 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
6260 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
6261 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
6262 rb_enc_inspect_name(str_enc),
6263 rb_enc_inspect_name(STR_ENC_GET(repl)));
6264 }
6265 enc = STR_ENC_GET(repl);
6266 }
6267 rb_str_modify(str);
6268 rb_enc_associate(str, enc);
6270 int cr2 = ENC_CODERANGE(repl);
6271 if (cr2 == ENC_CODERANGE_BROKEN ||
6272 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
6274 else
6275 cr = cr2;
6276 }
6277 plen = end0 - beg0;
6278 rlen = RSTRING_LEN(repl);
6279 len = RSTRING_LEN(str);
6280 if (rlen > plen) {
6281 RESIZE_CAPA(str, len + rlen - plen);
6282 }
6283 p = RSTRING_PTR(str);
6284 if (rlen != plen) {
6285 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
6286 }
6287 rp = RSTRING_PTR(repl);
6288 memmove(p + beg0, rp, rlen);
6289 len += rlen - plen;
6290 STR_SET_LEN(str, len);
6291 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
6292 ENC_CODERANGE_SET(str, cr);
6293
6294 RB_GC_GUARD(match);
6295
6296 return str;
6297 }
6298 return Qnil;
6299}
6300
6301
6302/*
6303 * call-seq:
6304 * sub(pattern, replacement) -> new_string
6305 * sub(pattern) {|match| ... } -> new_string
6306 *
6307 * :include: doc/string/sub.rdoc
6308 */
6309
6310static VALUE
6311rb_str_sub(int argc, VALUE *argv, VALUE str)
6312{
6313 str = str_duplicate(rb_cString, str);
6314 rb_str_sub_bang(argc, argv, str);
6315 return str;
6316}
6317
6318static VALUE
6319str_gsub(int argc, VALUE *argv, VALUE str, int bang)
6320{
6321 VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil, match = Qnil;
6322 long beg, beg0, end0;
6323 long offset, blen, slen, len, last;
6324 enum {STR, ITER, FAST_MAP, MAP} mode = STR;
6325 char *sp, *cp;
6326 int need_backref_str = -1;
6327 rb_encoding *str_enc;
6328
6329 switch (argc) {
6330 case 1:
6331 RETURN_ENUMERATOR(str, argc, argv);
6332 mode = ITER;
6333 break;
6334 case 2:
6335 repl = argv[1];
6336 hash = rb_check_hash_type(argv[1]);
6337 if (NIL_P(hash)) {
6338 StringValue(repl);
6339 }
6340 else if (rb_hash_default_unredefined(hash) && !FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
6341 mode = FAST_MAP;
6342 }
6343 else {
6344 mode = MAP;
6345 }
6346 break;
6347 default:
6348 rb_error_arity(argc, 1, 2);
6349 }
6350
6351 pat = get_pat_quoted(argv[0], 1);
6352 beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);
6353
6354 if (beg < 0) {
6355 if (bang) return Qnil; /* no match, no substitution */
6356 return str_duplicate(rb_cString, str);
6357 }
6358
6359 offset = 0;
6360 blen = RSTRING_LEN(str) + 30; /* len + margin */
6361 dest = rb_str_buf_new(blen);
6362 sp = RSTRING_PTR(str);
6363 slen = RSTRING_LEN(str);
6364 cp = sp;
6365 str_enc = STR_ENC_GET(str);
6366 rb_enc_associate(dest, str_enc);
6367 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
6368
6369 do {
6370 struct re_registers *regs = RMATCH_REGS(match);
6371 if (RB_TYPE_P(pat, T_STRING)) {
6372 beg0 = beg;
6373 end0 = beg0 + RSTRING_LEN(pat);
6374 match0 = pat;
6375 }
6376 else {
6377 beg0 = BEG(0);
6378 end0 = END(0);
6379 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
6380 }
6381
6382 if (mode != STR) {
6383 if (mode == ITER) {
6384 val = rb_obj_as_string(rb_yield(match0));
6385 }
6386 else {
6387 struct RString fake_str = {RBASIC_INIT};
6388 VALUE key;
6389 if (mode == FAST_MAP) {
6390 // It is safe to use a fake_str here because we established that it won't escape,
6391 // as it's only used for `rb_hash_aref` and we checked the hash doesn't have a
6392 // default proc.
6393 key = setup_fake_str(&fake_str, sp + beg0, end0 - beg0, ENCODING_GET_INLINED(str));
6394 }
6395 else {
6396 key = rb_str_subseq(str, beg0, end0 - beg0);
6397 }
6398 val = rb_hash_aref(hash, key);
6399 val = rb_obj_as_string(val);
6400 }
6401 str_mod_check(str, sp, slen);
6402 if (val == dest) { /* paranoid check [ruby-dev:24827] */
6403 rb_raise(rb_eRuntimeError, "block should not cheat");
6404 }
6405 }
6406 else if (need_backref_str) {
6407 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6408 if (need_backref_str < 0) {
6409 need_backref_str = val != repl;
6410 }
6411 }
6412 else {
6413 val = repl;
6414 }
6415
6416 len = beg0 - offset; /* copy pre-match substr */
6417 if (len) {
6418 rb_enc_str_buf_cat(dest, cp, len, str_enc);
6419 }
6420
6421 rb_str_buf_append(dest, val);
6422
6423 last = offset;
6424 offset = end0;
6425 if (beg0 == end0) {
6426 /*
6427 * Always consume at least one character of the input string
6428 * in order to prevent infinite loops.
6429 */
6430 if (RSTRING_LEN(str) <= end0) break;
6431 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
6432 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
6433 offset = end0 + len;
6434 }
6435 cp = RSTRING_PTR(str) + offset;
6436 if (offset > RSTRING_LEN(str)) break;
6437
6438 // In FAST_MAP and STR mode the backref can't escape so we can re-use the MatchData safely.
6439 if (mode != FAST_MAP && mode != STR) {
6440 match = Qnil;
6441 }
6442 beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);
6443
6444 RB_GC_GUARD(match);
6445 } while (beg >= 0);
6446
6447 if (RSTRING_LEN(str) > offset) {
6448 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
6449 }
6450 rb_pat_search0(pat, str, last, 1, &match);
6451 if (bang) {
6452 str_shared_replace(str, dest);
6453 }
6454 else {
6455 str = dest;
6456 }
6457
6458 return str;
6459}
6460
6461
6462/*
6463 * call-seq:
6464 * gsub!(pattern, replacement) -> self or nil
6465 * gsub!(pattern) {|match| ... } -> self or nil
6466 * gsub!(pattern) -> an_enumerator
6467 *
6468 * Like String#gsub, except that:
6469 *
6470 * - Performs substitutions in +self+ (not in a copy of +self+).
6471 * - Returns +self+ if any substitutions are made, +nil+ otherwise.
6472 *
6473 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6474 */
6475
6476static VALUE
6477rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6478{
6479 str_modify_keep_cr(str);
6480 return str_gsub(argc, argv, str, 1);
6481}
6482
6483
6484/*
6485 * call-seq:
6486 * gsub(pattern, replacement) -> new_string
6487 * gsub(pattern) {|match| ... } -> new_string
6488 * gsub(pattern) -> enumerator
6489 *
6490 * Returns a copy of +self+ with zero or more substrings replaced.
6491 *
6492 * Argument +pattern+ may be a string or a Regexp;
6493 * argument +replacement+ may be a string or a Hash.
6494 * Varying types for the argument values makes this method very versatile.
6495 *
6496 * Below are some simple examples;
6497 * for many more examples, see {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6498 *
6499 * With arguments +pattern+ and string +replacement+ given,
6500 * replaces each matching substring with the given +replacement+ string:
6501 *
6502 * s = 'abracadabra'
6503 * s.gsub('ab', 'AB') # => "ABracadABra"
6504 * s.gsub(/[a-c]/, 'X') # => "XXrXXXdXXrX"
6505 *
6506 * With arguments +pattern+ and hash +replacement+ given,
6507 * replaces each matching substring with a value from the given +replacement+ hash,
6508 * or removes it:
6509 *
6510 * h = {'a' => 'A', 'b' => 'B', 'c' => 'C'}
6511 * s.gsub(/[a-c]/, h) # => "ABrACAdABrA" # 'a', 'b', 'c' replaced.
6512 * s.gsub(/[a-d]/, h) # => "ABrACAABrA" # 'd' removed.
6513 *
6514 * With argument +pattern+ and a block given,
6515 * calls the block with each matching substring;
6516 * replaces that substring with the block's return value:
6517 *
6518 * s.gsub(/[a-d]/) {|substring| substring.upcase }
6519 * # => "ABrACADABrA"
6520 *
6521 * With argument +pattern+ and no block given,
6522 * returns a new Enumerator.
6523 *
6524 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6525 */
6526
6527static VALUE
6528rb_str_gsub(int argc, VALUE *argv, VALUE str)
6529{
6530 return str_gsub(argc, argv, str, 0);
6531}
6532
6533
6534/*
6535 * call-seq:
6536 * replace(other_string) -> self
6537 *
6538 * Replaces the contents of +self+ with the contents of +other_string+;
6539 * returns +self+:
6540 *
6541 * s = 'foo' # => "foo"
6542 * s.replace('bar') # => "bar"
6543 *
6544 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6545 */
6546
6547VALUE
6549{
6550 str_modifiable(str);
6551 if (str == str2) return str;
6552
6553 StringValue(str2);
6554 str_discard(str);
6555 return str_replace(str, str2);
6556}
6557
6558/*
6559 * call-seq:
6560 * clear -> self
6561 *
6562 * Removes the contents of +self+:
6563 *
6564 * s = 'foo'
6565 * s.clear # => ""
6566 * s # => ""
6567 *
6568 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6569 */
6570
6571static VALUE
6572rb_str_clear(VALUE str)
6573{
6574 str_discard(str);
6575 STR_SET_EMBED(str);
6576 STR_SET_LEN(str, 0);
6577 RSTRING_PTR(str)[0] = 0;
6578 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6580 else
6582 return str;
6583}
6584
6585/*
6586 * call-seq:
6587 * chr -> string
6588 *
6589 * :include: doc/string/chr.rdoc
6590 *
6591 */
6592
6593static VALUE
6594rb_str_chr(VALUE str)
6595{
6596 return rb_str_substr(str, 0, 1);
6597}
6598
6599/*
6600 * call-seq:
6601 * getbyte(index) -> integer or nil
6602 *
6603 * :include: doc/string/getbyte.rdoc
6604 *
6605 */
6606VALUE
6607rb_str_getbyte(VALUE str, VALUE index)
6608{
6609 long pos = NUM2LONG(index);
6610
6611 if (pos < 0)
6612 pos += RSTRING_LEN(str);
6613 if (pos < 0 || RSTRING_LEN(str) <= pos)
6614 return Qnil;
6615
6616 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6617}
6618
6619/*
6620 * call-seq:
6621 * setbyte(index, integer) -> integer
6622 *
6623 * Sets the byte at zero-based offset +index+ to the value of the given +integer+;
6624 * returns +integer+:
6625 *
6626 * s = 'xyzzy'
6627 * s.setbyte(2, 129) # => 129
6628 * s # => "xy\x81zy"
6629 *
6630 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6631 */
6632VALUE
6633rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6634{
6635 long pos = NUM2LONG(index);
6636 long len = RSTRING_LEN(str);
6637 char *ptr, *head, *left = 0;
6638 rb_encoding *enc;
6639 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6640
6641 if (pos < -len || len <= pos)
6642 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6643 if (pos < 0)
6644 pos += len;
6645
6646 VALUE v = rb_to_int(value);
6647 VALUE w = rb_int_and(v, INT2FIX(0xff));
6648 char byte = (char)(NUM2INT(w) & 0xFF);
6649
6650 if (!str_independent(str))
6651 str_make_independent(str);
6652 enc = STR_ENC_GET(str);
6653 head = RSTRING_PTR(str);
6654 ptr = &head[pos];
6655 if (!STR_EMBED_P(str)) {
6656 cr = ENC_CODERANGE(str);
6657 switch (cr) {
6658 case ENC_CODERANGE_7BIT:
6659 left = ptr;
6660 *ptr = byte;
6661 if (ISASCII(byte)) goto end;
6662 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6663 if (!MBCLEN_CHARFOUND_P(nlen))
6665 else
6667 goto end;
6669 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6670 width = rb_enc_precise_mbclen(left, head+len, enc);
6671 *ptr = byte;
6672 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6673 if (!MBCLEN_CHARFOUND_P(nlen))
6675 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6677 goto end;
6678 }
6679 }
6681 *ptr = byte;
6682
6683 end:
6684 return value;
6685}
6686
6687static VALUE
6688str_byte_substr(VALUE str, long beg, long len, int empty)
6689{
6690 long n = RSTRING_LEN(str);
6691
6692 if (beg > n || len < 0) return Qnil;
6693 if (beg < 0) {
6694 beg += n;
6695 if (beg < 0) return Qnil;
6696 }
6697 if (len > n - beg)
6698 len = n - beg;
6699 if (len <= 0) {
6700 if (!empty) return Qnil;
6701 len = 0;
6702 }
6703
6704 VALUE str2 = str_subseq(str, beg, len);
6705
6706 str_enc_copy_direct(str2, str);
6707
6708 if (RSTRING_LEN(str2) == 0) {
6709 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6711 else
6713 }
6714 else {
6715 switch (ENC_CODERANGE(str)) {
6716 case ENC_CODERANGE_7BIT:
6718 break;
6719 default:
6721 break;
6722 }
6723 }
6724
6725 return str2;
6726}
6727
6728VALUE
6729rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
6730{
6731 return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
6732}
6733
6734static VALUE
6735str_byte_aref(VALUE str, VALUE indx)
6736{
6737 long idx;
6738 if (FIXNUM_P(indx)) {
6739 idx = FIX2LONG(indx);
6740 }
6741 else {
6742 /* check if indx is Range */
6743 long beg, len = RSTRING_LEN(str);
6744
6745 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6746 case Qfalse:
6747 break;
6748 case Qnil:
6749 return Qnil;
6750 default:
6751 return str_byte_substr(str, beg, len, TRUE);
6752 }
6753
6754 idx = NUM2LONG(indx);
6755 }
6756 return str_byte_substr(str, idx, 1, FALSE);
6757}
6758
6759/*
6760 * call-seq:
6761 * byteslice(offset, length = 1) -> string or nil
6762 * byteslice(range) -> string or nil
6763 *
6764 * :include: doc/string/byteslice.rdoc
6765 */
6766
6767static VALUE
6768rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6769{
6770 if (argc == 2) {
6771 long beg = NUM2LONG(argv[0]);
6772 long len = NUM2LONG(argv[1]);
6773 return str_byte_substr(str, beg, len, TRUE);
6774 }
6775 rb_check_arity(argc, 1, 2);
6776 return str_byte_aref(str, argv[0]);
6777}
6778
6779static void
6780str_check_beg_len(VALUE str, long *beg, long *len)
6781{
6782 long end, slen = RSTRING_LEN(str);
6783
6784 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6785 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6786 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6787 }
6788 if (*beg < 0) {
6789 *beg += slen;
6790 }
6791 RUBY_ASSERT(*beg >= 0);
6792 RUBY_ASSERT(*beg <= slen);
6793
6794 if (*len > slen - *beg) {
6795 *len = slen - *beg;
6796 }
6797 end = *beg + *len;
6798 str_ensure_byte_pos(str, *beg);
6799 str_ensure_byte_pos(str, end);
6800}
6801
6802/*
6803 * call-seq:
6804 * bytesplice(offset, length, str) -> self
6805 * bytesplice(offset, length, str, str_offset, str_length) -> self
6806 * bytesplice(range, str) -> self
6807 * bytesplice(range, str, str_range) -> self
6808 *
6809 * :include: doc/string/bytesplice.rdoc
6810 */
6811
6812static VALUE
6813rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6814{
6815 long beg, len, vbeg, vlen;
6816 VALUE val;
6817 int cr;
6818
6819 rb_check_arity(argc, 2, 5);
6820 if (!(argc == 2 || argc == 3 || argc == 5)) {
6821 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6822 }
6823 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6824 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6825 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6826 rb_builtin_class_name(argv[0]));
6827 }
6828 val = argv[1];
6829 StringValue(val);
6830 if (argc == 2) {
6831 /* bytesplice(range, str) */
6832 vbeg = 0;
6833 vlen = RSTRING_LEN(val);
6834 }
6835 else {
6836 /* bytesplice(range, str, str_range) */
6837 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6838 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6839 rb_builtin_class_name(argv[2]));
6840 }
6841 }
6842 }
6843 else {
6844 beg = NUM2LONG(argv[0]);
6845 len = NUM2LONG(argv[1]);
6846 val = argv[2];
6847 StringValue(val);
6848 if (argc == 3) {
6849 /* bytesplice(index, length, str) */
6850 vbeg = 0;
6851 vlen = RSTRING_LEN(val);
6852 }
6853 else {
6854 /* bytesplice(index, length, str, str_index, str_length) */
6855 vbeg = NUM2LONG(argv[3]);
6856 vlen = NUM2LONG(argv[4]);
6857 }
6858 }
6859 str_check_beg_len(str, &beg, &len);
6860 str_check_beg_len(val, &vbeg, &vlen);
6861 str_modify_keep_cr(str);
6862
6863 if (RB_UNLIKELY(ENCODING_GET_INLINED(str) != ENCODING_GET_INLINED(val))) {
6864 rb_enc_associate(str, rb_enc_check(str, val));
6865 }
6866
6867 rb_str_update_1(str, beg, len, val, vbeg, vlen);
6869 if (cr != ENC_CODERANGE_BROKEN)
6870 ENC_CODERANGE_SET(str, cr);
6871 return str;
6872}
6873
6874/*
6875 * call-seq:
6876 * reverse -> new_string
6877 *
6878 * Returns a new string with the characters from +self+ in reverse order.
6879 *
6880 * 'drawer'.reverse # => "reward"
6881 * 'reviled'.reverse # => "deliver"
6882 * 'stressed'.reverse # => "desserts"
6883 * 'semordnilaps'.reverse # => "spalindromes"
6884 *
6885 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6886 */
6887
6888static VALUE
6889rb_str_reverse(VALUE str)
6890{
6891 rb_encoding *enc;
6892 VALUE rev;
6893 char *s, *e, *p;
6894 int cr;
6895
6896 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6897 enc = STR_ENC_GET(str);
6898 rev = rb_str_new(0, RSTRING_LEN(str));
6899 s = RSTRING_PTR(str); e = RSTRING_END(str);
6900 p = RSTRING_END(rev);
6901 cr = ENC_CODERANGE(str);
6902
6903 if (RSTRING_LEN(str) > 1) {
6904 if (single_byte_optimizable(str)) {
6905 while (s < e) {
6906 *--p = *s++;
6907 }
6908 }
6909 else if (cr == ENC_CODERANGE_VALID) {
6910 while (s < e) {
6911 int clen = rb_enc_fast_mbclen(s, e, enc);
6912
6913 p -= clen;
6914 memcpy(p, s, clen);
6915 s += clen;
6916 }
6917 }
6918 else {
6919 cr = rb_enc_asciicompat(enc) ?
6921 while (s < e) {
6922 int clen = rb_enc_mbclen(s, e, enc);
6923
6924 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6925 p -= clen;
6926 memcpy(p, s, clen);
6927 s += clen;
6928 }
6929 }
6930 }
6931 STR_SET_LEN(rev, RSTRING_LEN(str));
6932 str_enc_copy_direct(rev, str);
6933 ENC_CODERANGE_SET(rev, cr);
6934
6935 return rev;
6936}
6937
6938
6939/*
6940 * call-seq:
6941 * reverse! -> self
6942 *
6943 * Returns +self+ with its characters reversed:
6944 *
6945 * 'drawer'.reverse! # => "reward"
6946 * 'reviled'.reverse! # => "deliver"
6947 * 'stressed'.reverse! # => "desserts"
6948 * 'semordnilaps'.reverse! # => "spalindromes"
6949 *
6950 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6951 */
6952
6953static VALUE
6954rb_str_reverse_bang(VALUE str)
6955{
6956 if (RSTRING_LEN(str) > 1) {
6957 if (single_byte_optimizable(str)) {
6958 char *s, *e, c;
6959
6960 str_modify_keep_cr(str);
6961 s = RSTRING_PTR(str);
6962 e = RSTRING_END(str) - 1;
6963 while (s < e) {
6964 c = *s;
6965 *s++ = *e;
6966 *e-- = c;
6967 }
6968 }
6969 else {
6970 str_shared_replace(str, rb_str_reverse(str));
6971 }
6972 }
6973 else {
6974 str_modify_keep_cr(str);
6975 }
6976 return str;
6977}
6978
6979
6980/*
6981 * call-seq:
6982 * include?(other_string) -> true or false
6983 *
6984 * Returns whether +self+ contains +other_string+:
6985 *
6986 * s = 'bar'
6987 * s.include?('ba') # => true
6988 * s.include?('ar') # => true
6989 * s.include?('bar') # => true
6990 * s.include?('a') # => true
6991 * s.include?('') # => true
6992 * s.include?('foo') # => false
6993 *
6994 * Related: see {Querying}[rdoc-ref:String@Querying].
6995 */
6996
6997VALUE
6998rb_str_include(VALUE str, VALUE arg)
6999{
7000 long i;
7001
7002 StringValue(arg);
7003 i = rb_str_index(str, arg, 0);
7004
7005 return RBOOL(i != -1);
7006}
7007
7008
7009/*
7010 * call-seq:
7011 * to_i(base = 10) -> integer
7012 *
7013 * Returns the result of interpreting leading characters in +self+
7014 * as an integer in the given +base+;
7015 * +base+ must be either +0+ or in range <tt>(2..36)</tt>:
7016 *
7017 * '123456'.to_i # => 123456
7018 * '123def'.to_i(16) # => 1195503
7019 *
7020 * With +base+ zero given, +self+ may contain leading characters
7021 * to specify the actual base:
7022 *
7023 * '123def'.to_i(0) # => 123
7024 * '0123def'.to_i(0) # => 83
7025 * '0b123def'.to_i(0) # => 1
7026 * '0o123def'.to_i(0) # => 83
7027 * '0d123def'.to_i(0) # => 123
7028 * '0x123def'.to_i(0) # => 1195503
7029 *
7030 * Characters past a leading valid number (in the given +base+) are ignored:
7031 *
7032 * '12.345'.to_i # => 12
7033 * '12345'.to_i(2) # => 1
7034 *
7035 * Returns zero if there is no leading valid number:
7036 *
7037 * 'abcdef'.to_i # => 0
7038 * '2'.to_i(2) # => 0
7039 *
7040 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
7041 */
7042
7043static VALUE
7044rb_str_to_i(int argc, VALUE *argv, VALUE str)
7045{
7046 int base = 10;
7047
7048 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
7049 rb_raise(rb_eArgError, "invalid radix %d", base);
7050 }
7051 return rb_str_to_inum(str, base, FALSE);
7052}
7053
7054
7055/*
7056 * call-seq:
7057 * to_f -> float
7058 *
7059 * Returns the result of interpreting leading characters in +self+ as a Float:
7060 *
7061 * '3.14159'.to_f # => 3.14159
7062 * '1.234e-2'.to_f # => 0.01234
7063 *
7064 * Characters past a leading valid number are ignored:
7065 *
7066 * '3.14 (pi to two places)'.to_f # => 3.14
7067 *
7068 * Returns zero if there is no leading valid number:
7069 *
7070 * 'abcdef'.to_f # => 0.0
7071 *
7072 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
7073 */
7074
7075static VALUE
7076rb_str_to_f(VALUE str)
7077{
7078 return DBL2NUM(rb_str_to_dbl(str, FALSE));
7079}
7080
7081
7082/*
7083 * call-seq:
7084 * to_s -> self or new_string
7085 *
7086 * Returns +self+ if +self+ is a +String+,
7087 * or +self+ converted to a +String+ if +self+ is a subclass of +String+.
7088 *
7089 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
7090 */
7091
7092static VALUE
7093rb_str_to_s(VALUE str)
7094{
7095 if (rb_obj_class(str) != rb_cString) {
7096 return str_duplicate(rb_cString, str);
7097 }
7098 return str;
7099}
7100
#if 0
/* Compiled out: encodes code point +c+ in +enc+ and appends it to +str+. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    const int width = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, width, enc);
}
#endif
7112
7113#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
7114
7115int
7116rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
7117{
7118 char buf[CHAR_ESC_LEN + 1];
7119 int l;
7120
7121#if SIZEOF_INT > 4
7122 c &= 0xffffffff;
7123#endif
7124 if (unicode_p) {
7125 if (c < 0x7F && ISPRINT(c)) {
7126 snprintf(buf, CHAR_ESC_LEN, "%c", c);
7127 }
7128 else if (c < 0x10000) {
7129 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
7130 }
7131 else {
7132 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
7133 }
7134 }
7135 else {
7136 if (c < 0x100) {
7137 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
7138 }
7139 else {
7140 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
7141 }
7142 }
7143 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
7144 rb_str_buf_cat(result, buf, l);
7145 return l;
7146}
7147
/*
 * Returns the backslash escape sequence used for control character +c+
 * (for example "\\n" for a newline), or NULL when +c+ has no dedicated
 * escape.
 */
const char *
ruby_escaped_char(int c)
{
    static const struct {
        int code;
        const char *escape;
    } escapes[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };
    size_t i;

    for (i = 0; i < sizeof(escapes) / sizeof(escapes[0]); i++) {
        if (escapes[i].code == c) {
            return escapes[i].escape;
        }
    }
    return NULL;
}
7165
7166VALUE
7167rb_str_escape(VALUE str)
7168{
7169 int encidx = ENCODING_GET(str);
7170 rb_encoding *enc = rb_enc_from_index(encidx);
7171 const char *p = RSTRING_PTR(str);
7172 const char *pend = RSTRING_END(str);
7173 const char *prev = p;
7174 char buf[CHAR_ESC_LEN + 1];
7175 VALUE result = rb_str_buf_new(0);
7176 int unicode_p = rb_enc_unicode_p(enc);
7177 int asciicompat = rb_enc_asciicompat(enc);
7178
7179 while (p < pend) {
7180 unsigned int c;
7181 const char *cc;
7182 int n = rb_enc_precise_mbclen(p, pend, enc);
7183 if (!MBCLEN_CHARFOUND_P(n)) {
7184 if (p > prev) str_buf_cat(result, prev, p - prev);
7185 n = rb_enc_mbminlen(enc);
7186 if (pend < p + n)
7187 n = (int)(pend - p);
7188 while (n--) {
7189 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7190 str_buf_cat(result, buf, strlen(buf));
7191 prev = ++p;
7192 }
7193 continue;
7194 }
7195 n = MBCLEN_CHARFOUND_LEN(n);
7196 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7197 p += n;
7198 cc = ruby_escaped_char(c);
7199 if (cc) {
7200 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7201 str_buf_cat(result, cc, strlen(cc));
7202 prev = p;
7203 }
7204 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
7205 }
7206 else {
7207 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7208 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7209 prev = p;
7210 }
7211 }
7212 if (p > prev) str_buf_cat(result, prev, p - prev);
7214
7215 return result;
7216}
7217
7218/*
7219 * call-seq:
7220 * inspect -> string
7221 *
7222 * :include: doc/string/inspect.rdoc
7223 *
7224 */
7225
7226VALUE
7228{
7229 int encidx = ENCODING_GET(str);
7230 rb_encoding *enc = rb_enc_from_index(encidx);
7231 const char *p, *pend, *prev;
7232 char buf[CHAR_ESC_LEN + 1];
7233 VALUE result = rb_str_buf_new(0);
7234 rb_encoding *resenc = rb_default_internal_encoding();
7235 int unicode_p = rb_enc_unicode_p(enc);
7236 int asciicompat = rb_enc_asciicompat(enc);
7237
7238 if (resenc == NULL) resenc = rb_default_external_encoding();
7239 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7240 rb_enc_associate(result, resenc);
7241 str_buf_cat2(result, "\"");
7242
7243 p = RSTRING_PTR(str); pend = RSTRING_END(str);
7244 prev = p;
7245 while (p < pend) {
7246 unsigned int c, cc;
7247 int n;
7248
7249 n = rb_enc_precise_mbclen(p, pend, enc);
7250 if (!MBCLEN_CHARFOUND_P(n)) {
7251 if (p > prev) str_buf_cat(result, prev, p - prev);
7252 n = rb_enc_mbminlen(enc);
7253 if (pend < p + n)
7254 n = (int)(pend - p);
7255 while (n--) {
7256 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7257 str_buf_cat(result, buf, strlen(buf));
7258 prev = ++p;
7259 }
7260 continue;
7261 }
7262 n = MBCLEN_CHARFOUND_LEN(n);
7263 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7264 p += n;
7265 if ((asciicompat || unicode_p) &&
7266 (c == '"'|| c == '\\' ||
7267 (c == '#' &&
7268 p < pend &&
7269 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
7270 (cc = rb_enc_codepoint(p,pend,enc),
7271 (cc == '$' || cc == '@' || cc == '{'))))) {
7272 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7273 str_buf_cat2(result, "\\");
7274 if (asciicompat || enc == resenc) {
7275 prev = p - n;
7276 continue;
7277 }
7278 }
7279 switch (c) {
7280 case '\n': cc = 'n'; break;
7281 case '\r': cc = 'r'; break;
7282 case '\t': cc = 't'; break;
7283 case '\f': cc = 'f'; break;
7284 case '\013': cc = 'v'; break;
7285 case '\010': cc = 'b'; break;
7286 case '\007': cc = 'a'; break;
7287 case 033: cc = 'e'; break;
7288 default: cc = 0; break;
7289 }
7290 if (cc) {
7291 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7292 buf[0] = '\\';
7293 buf[1] = (char)cc;
7294 str_buf_cat(result, buf, 2);
7295 prev = p;
7296 continue;
7297 }
7298 /* The special casing of 0x85 (NEXT_LINE) here is because
7299 * Oniguruma historically treats it as printable, but it
7300 * doesn't match the print POSIX bracket class or character
7301 * property in regexps.
7302 *
7303 * See Ruby Bug #16842 for details:
7304 * https://bugs.ruby-lang.org/issues/16842
7305 */
7306 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
7307 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
7308 continue;
7309 }
7310 else {
7311 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7312 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7313 prev = p;
7314 continue;
7315 }
7316 }
7317 if (p > prev) str_buf_cat(result, prev, p - prev);
7318 str_buf_cat2(result, "\"");
7319
7320 return result;
7321}
7322
/*
 * True when p lies before e and points at '$', '@' or '{'.
 * Evaluates p more than once, so the argument must be free of side effects.
 */
#define IS_EVSTR(p,e) \
    ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7324
7325/*
7326 * call-seq:
7327 * dump -> new_string
7328 *
7329 * :include: doc/string/dump.rdoc
7330 *
7331 */
7332
7333VALUE
7335{
7336 int encidx = rb_enc_get_index(str);
7337 rb_encoding *enc = rb_enc_from_index(encidx);
7338 long len;
7339 const char *p, *pend;
7340 char *q, *qend;
7341 VALUE result;
7342 int u8 = (encidx == rb_utf8_encindex());
7343 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
7344
7345 len = 2; /* "" */
7346 if (!rb_enc_asciicompat(enc)) {
7347 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
7348 len += strlen(enc->name);
7349 }
7350
7351 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7352 while (p < pend) {
7353 int clen;
7354 unsigned char c = *p++;
7355
7356 switch (c) {
7357 case '"': case '\\':
7358 case '\n': case '\r':
7359 case '\t': case '\f':
7360 case '\013': case '\010': case '\007': case '\033':
7361 clen = 2;
7362 break;
7363
7364 case '#':
7365 clen = IS_EVSTR(p, pend) ? 2 : 1;
7366 break;
7367
7368 default:
7369 if (ISPRINT(c)) {
7370 clen = 1;
7371 }
7372 else {
7373 if (u8 && c > 0x7F) { /* \u notation */
7374 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7375 if (MBCLEN_CHARFOUND_P(n)) {
7376 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7377 if (cc <= 0xFFFF)
7378 clen = 6; /* \uXXXX */
7379 else if (cc <= 0xFFFFF)
7380 clen = 9; /* \u{XXXXX} */
7381 else
7382 clen = 10; /* \u{XXXXXX} */
7383 p += MBCLEN_CHARFOUND_LEN(n)-1;
7384 break;
7385 }
7386 }
7387 clen = 4; /* \xNN */
7388 }
7389 break;
7390 }
7391
7392 if (clen > LONG_MAX - len) {
7393 rb_raise(rb_eRuntimeError, "string size too big");
7394 }
7395 len += clen;
7396 }
7397
7398 result = rb_str_new(0, len);
7399 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7400 q = RSTRING_PTR(result); qend = q + len + 1;
7401
7402 *q++ = '"';
7403 while (p < pend) {
7404 unsigned char c = *p++;
7405
7406 if (c == '"' || c == '\\') {
7407 *q++ = '\\';
7408 *q++ = c;
7409 }
7410 else if (c == '#') {
7411 if (IS_EVSTR(p, pend)) *q++ = '\\';
7412 *q++ = '#';
7413 }
7414 else if (c == '\n') {
7415 *q++ = '\\';
7416 *q++ = 'n';
7417 }
7418 else if (c == '\r') {
7419 *q++ = '\\';
7420 *q++ = 'r';
7421 }
7422 else if (c == '\t') {
7423 *q++ = '\\';
7424 *q++ = 't';
7425 }
7426 else if (c == '\f') {
7427 *q++ = '\\';
7428 *q++ = 'f';
7429 }
7430 else if (c == '\013') {
7431 *q++ = '\\';
7432 *q++ = 'v';
7433 }
7434 else if (c == '\010') {
7435 *q++ = '\\';
7436 *q++ = 'b';
7437 }
7438 else if (c == '\007') {
7439 *q++ = '\\';
7440 *q++ = 'a';
7441 }
7442 else if (c == '\033') {
7443 *q++ = '\\';
7444 *q++ = 'e';
7445 }
7446 else if (ISPRINT(c)) {
7447 *q++ = c;
7448 }
7449 else {
7450 *q++ = '\\';
7451 if (u8) {
7452 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7453 if (MBCLEN_CHARFOUND_P(n)) {
7454 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7455 p += n;
7456 if (cc <= 0xFFFF)
7457 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7458 else
7459 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7460 q += strlen(q);
7461 continue;
7462 }
7463 }
7464 snprintf(q, qend-q, "x%02X", c);
7465 q += 3;
7466 }
7467 }
7468 *q++ = '"';
7469 *q = '\0';
7470 if (!rb_enc_asciicompat(enc)) {
7471 snprintf(q, qend-q, nonascii_suffix, enc->name);
7472 encidx = rb_ascii8bit_encindex();
7473 }
7474 /* result from dump is ASCII */
7475 rb_enc_associate_index(result, encidx);
7477 return result;
7478}
7479
/*
 * Maps the letter that follows a backslash in a dumped string to the
 * control character it denotes ('n' -> newline, 'e' -> ESC, ...).
 *
 * Returns the character code, or -1 when c is not one of the eight
 * supported letters.  The caller in this file only passes those letters,
 * so the -1 path exists to give every control path a defined return value.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
      default:
        break;
    }
    /* not an escape letter */
    return -1;
}
7503
7504static void
7505undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7506{
7507 const char *s = *ss;
7508 unsigned int c;
7509 int codelen;
7510 size_t hexlen;
7511 unsigned char buf[6];
7512 static rb_encoding *enc_utf8 = NULL;
7513
7514 switch (*s) {
7515 case '\\':
7516 case '"':
7517 case '#':
7518 rb_str_cat(undumped, s, 1); /* cat itself */
7519 s++;
7520 break;
7521 case 'n':
7522 case 'r':
7523 case 't':
7524 case 'f':
7525 case 'v':
7526 case 'b':
7527 case 'a':
7528 case 'e':
7529 *buf = unescape_ascii(*s);
7530 rb_str_cat(undumped, (char *)buf, 1);
7531 s++;
7532 break;
7533 case 'u':
7534 if (*binary) {
7535 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7536 }
7537 *utf8 = true;
7538 if (++s >= s_end) {
7539 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7540 }
7541 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7542 if (*penc != enc_utf8) {
7543 *penc = enc_utf8;
7544 rb_enc_associate(undumped, enc_utf8);
7545 }
7546 if (*s == '{') { /* handle \u{...} form */
7547 s++;
7548 for (;;) {
7549 if (s >= s_end) {
7550 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7551 }
7552 if (*s == '}') {
7553 s++;
7554 break;
7555 }
7556 if (ISSPACE(*s)) {
7557 s++;
7558 continue;
7559 }
7560 c = scan_hex(s, s_end-s, &hexlen);
7561 if (hexlen == 0 || hexlen > 6) {
7562 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7563 }
7564 if (c > 0x10ffff) {
7565 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7566 }
7567 if (0xd800 <= c && c <= 0xdfff) {
7568 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7569 }
7570 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7571 rb_str_cat(undumped, (char *)buf, codelen);
7572 s += hexlen;
7573 }
7574 }
7575 else { /* handle \uXXXX form */
7576 c = scan_hex(s, 4, &hexlen);
7577 if (hexlen != 4) {
7578 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7579 }
7580 if (0xd800 <= c && c <= 0xdfff) {
7581 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7582 }
7583 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7584 rb_str_cat(undumped, (char *)buf, codelen);
7585 s += hexlen;
7586 }
7587 break;
7588 case 'x':
7589 if (++s >= s_end) {
7590 rb_raise(rb_eRuntimeError, "invalid hex escape");
7591 }
7592 *buf = scan_hex(s, 2, &hexlen);
7593 if (hexlen != 2) {
7594 rb_raise(rb_eRuntimeError, "invalid hex escape");
7595 }
7596 if (!ISASCII(*buf)) {
7597 if (*utf8) {
7598 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7599 }
7600 *binary = true;
7601 }
7602 rb_str_cat(undumped, (char *)buf, 1);
7603 s += hexlen;
7604 break;
7605 default:
7606 rb_str_cat(undumped, s-1, 2);
7607 s++;
7608 }
7609
7610 *ss = s;
7611}
7612
7613static VALUE rb_str_is_ascii_only_p(VALUE str);
7614
7615/*
7616 * call-seq:
7617 * undump -> new_string
7618 *
7619 * Inverse of String#dump; returns a copy of +self+ with changes of the kinds made by String#dump "undone."
7620 *
7621 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
7622 */
7623
7624static VALUE
7625str_undump(VALUE str)
7626{
7627 const char *s = RSTRING_PTR(str);
7628 const char *s_end = RSTRING_END(str);
7629 rb_encoding *enc = rb_enc_get(str);
7630 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7631 bool utf8 = false;
7632 bool binary = false;
7633 int w;
7634
7636 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7637 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7638 }
7639 if (!str_null_check(str, &w)) {
7640 rb_raise(rb_eRuntimeError, "string contains null byte");
7641 }
7642 if (RSTRING_LEN(str) < 2) goto invalid_format;
7643 if (*s != '"') goto invalid_format;
7644
7645 /* strip '"' at the start */
7646 s++;
7647
7648 for (;;) {
7649 if (s >= s_end) {
7650 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7651 }
7652
7653 if (*s == '"') {
7654 /* epilogue */
7655 s++;
7656 if (s == s_end) {
7657 /* ascii compatible dumped string */
7658 break;
7659 }
7660 else {
7661 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7662 static const char dup_suffix[] = ".dup";
7663 const char *encname;
7664 int encidx;
7665 ptrdiff_t size;
7666
7667 /* check separately for strings dumped by older versions */
7668 size = sizeof(dup_suffix) - 1;
7669 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7670
7671 size = sizeof(force_encoding_suffix) - 1;
7672 if (s_end - s <= size) goto invalid_format;
7673 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7674 s += size;
7675
7676 if (utf8) {
7677 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7678 }
7679
7680 encname = s;
7681 s = memchr(s, '"', s_end-s);
7682 size = s - encname;
7683 if (!s) goto invalid_format;
7684 if (s_end - s != 2) goto invalid_format;
7685 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7686
7687 encidx = rb_enc_find_index2(encname, (long)size);
7688 if (encidx < 0) {
7689 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7690 }
7691 rb_enc_associate_index(undumped, encidx);
7692 }
7693 break;
7694 }
7695
7696 if (*s == '\\') {
7697 s++;
7698 if (s >= s_end) {
7699 rb_raise(rb_eRuntimeError, "invalid escape");
7700 }
7701 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7702 }
7703 else {
7704 rb_str_cat(undumped, s++, 1);
7705 }
7706 }
7707
7708 RB_GC_GUARD(str);
7709
7710 return undumped;
7711invalid_format:
7712 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7713}
7714
7715static void
7716rb_str_check_dummy_enc(rb_encoding *enc)
7717{
7718 if (rb_enc_dummy_p(enc)) {
7719 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7720 rb_enc_name(enc));
7721 }
7722}
7723
7724static rb_encoding *
7725str_true_enc(VALUE str)
7726{
7727 rb_encoding *enc = STR_ENC_GET(str);
7728 rb_str_check_dummy_enc(enc);
7729 return enc;
7730}
7731
7732static OnigCaseFoldType
7733check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7734{
7735 if (argc==0)
7736 return flags;
7737 if (argc>2)
7738 rb_raise(rb_eArgError, "too many options");
7739 if (argv[0]==sym_turkic) {
7740 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7741 if (argc==2) {
7742 if (argv[1]==sym_lithuanian)
7743 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7744 else
7745 rb_raise(rb_eArgError, "invalid second option");
7746 }
7747 }
7748 else if (argv[0]==sym_lithuanian) {
7749 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7750 if (argc==2) {
7751 if (argv[1]==sym_turkic)
7752 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7753 else
7754 rb_raise(rb_eArgError, "invalid second option");
7755 }
7756 }
7757 else if (argc>1)
7758 rb_raise(rb_eArgError, "too many options");
7759 else if (argv[0]==sym_ascii)
7760 flags |= ONIGENC_CASE_ASCII_ONLY;
7761 else if (argv[0]==sym_fold) {
7762 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7763 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7764 else
7765 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7766 }
7767 else
7768 rb_raise(rb_eArgError, "invalid option");
7769 return flags;
7770}
7771
7772static inline bool
7773case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7774{
7775 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7776 return true;
7777 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7778}
7779
7780/* 16 should be long enough to absorb any kind of single character length increase; the constant below is set to 20 */
7781#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7782#ifndef CASEMAP_DEBUG
7783# define CASEMAP_DEBUG 0
7784#endif
7785
7786struct mapping_buffer;
7787typedef struct mapping_buffer {
7788 size_t capa;
7789 size_t used;
7790 struct mapping_buffer *next;
7791 OnigUChar space[FLEX_ARY_LEN];
7793
7794static void
7795mapping_buffer_free(void *p)
7796{
7797 mapping_buffer *previous_buffer;
7798 mapping_buffer *current_buffer = p;
7799 while (current_buffer) {
7800 previous_buffer = current_buffer;
7801 current_buffer = current_buffer->next;
7802 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7803 }
7804}
7805
7806static const rb_data_type_t mapping_buffer_type = {
7807 "mapping_buffer",
7808 {0, mapping_buffer_free,},
7809 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7810};
7811
7812static VALUE
7813rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7814{
7815 VALUE target;
7816
7817 const OnigUChar *source_current, *source_end;
7818 int target_length = 0;
7819 VALUE buffer_anchor;
7820 mapping_buffer *current_buffer = 0;
7821 mapping_buffer **pre_buffer;
7822 size_t buffer_count = 0;
7823 int buffer_length_or_invalid;
7824
7825 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7826
7827 source_current = (OnigUChar*)RSTRING_PTR(source);
7828 source_end = (OnigUChar*)RSTRING_END(source);
7829
7830 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7831 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7832 while (source_current < source_end) {
7833 /* increase multiplier using buffer count to converge quickly */
7834 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7835 if (CASEMAP_DEBUG) {
7836 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7837 }
7838 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7839 *pre_buffer = current_buffer;
7840 pre_buffer = &current_buffer->next;
7841 current_buffer->next = NULL;
7842 current_buffer->capa = capa;
7843 buffer_length_or_invalid = enc->case_map(flags,
7844 &source_current, source_end,
7845 current_buffer->space,
7846 current_buffer->space+current_buffer->capa,
7847 enc);
7848 if (buffer_length_or_invalid < 0) {
7849 current_buffer = DATA_PTR(buffer_anchor);
7850 DATA_PTR(buffer_anchor) = 0;
7851 mapping_buffer_free(current_buffer);
7852 rb_raise(rb_eArgError, "input string invalid");
7853 }
7854 target_length += current_buffer->used = buffer_length_or_invalid;
7855 }
7856 if (CASEMAP_DEBUG) {
7857 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7858 }
7859
7860 if (buffer_count==1) {
7861 target = rb_str_new((const char*)current_buffer->space, target_length);
7862 }
7863 else {
7864 char *target_current;
7865
7866 target = rb_str_new(0, target_length);
7867 target_current = RSTRING_PTR(target);
7868 current_buffer = DATA_PTR(buffer_anchor);
7869 while (current_buffer) {
7870 memcpy(target_current, current_buffer->space, current_buffer->used);
7871 target_current += current_buffer->used;
7872 current_buffer = current_buffer->next;
7873 }
7874 }
7875 current_buffer = DATA_PTR(buffer_anchor);
7876 DATA_PTR(buffer_anchor) = 0;
7877 mapping_buffer_free(current_buffer);
7878
7879 RB_GC_GUARD(buffer_anchor);
7880
7881 /* TODO: check about string terminator character */
7882 str_enc_copy_direct(target, source);
7883 /*ENC_CODERANGE_SET(mapped, cr);*/
7884
7885 return target;
7886}
7887
7888static VALUE
7889rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7890{
7891 const OnigUChar *source_current, *source_end;
7892 OnigUChar *target_current, *target_end;
7893 long old_length = RSTRING_LEN(source);
7894 int length_or_invalid;
7895
7896 if (old_length == 0) return Qnil;
7897
7898 source_current = (OnigUChar*)RSTRING_PTR(source);
7899 source_end = (OnigUChar*)RSTRING_END(source);
7900 if (source == target) {
7901 target_current = (OnigUChar*)source_current;
7902 target_end = (OnigUChar*)source_end;
7903 }
7904 else {
7905 target_current = (OnigUChar*)RSTRING_PTR(target);
7906 target_end = (OnigUChar*)RSTRING_END(target);
7907 }
7908
7909 length_or_invalid = onigenc_ascii_only_case_map(flags,
7910 &source_current, source_end,
7911 target_current, target_end, enc);
7912 if (length_or_invalid < 0)
7913 rb_raise(rb_eArgError, "input string invalid");
7914 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7915 fprintf(stderr, "problem with rb_str_ascii_casemap"
7916 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7917 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7918 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7919 }
7920
7921 str_enc_copy(target, source);
7922
7923 return target;
7924}
7925
7926static bool
7927upcase_single(VALUE str)
7928{
7929 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7930 bool modified = false;
7931
7932 while (s < send) {
7933 unsigned int c = *(unsigned char*)s;
7934
7935 if ('a' <= c && c <= 'z') {
7936 *s = 'A' + (c - 'a');
7937 modified = true;
7938 }
7939 s++;
7940 }
7941 return modified;
7942}
7943
7944/*
7945 * call-seq:
7946 * upcase!(mapping) -> self or nil
7947 *
7948 * Like String#upcase, except that:
7949 *
7950 * - Changes character casings in +self+ (not in a copy of +self+).
7951 * - Returns +self+ if any changes are made, +nil+ otherwise.
7952 *
7953 * Related: See {Modifying}[rdoc-ref:String@Modifying].
7954 */
7955
7956static VALUE
7957rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7958{
7959 rb_encoding *enc;
7960 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7961
7962 flags = check_case_options(argc, argv, flags);
7963 str_modify_keep_cr(str);
7964 enc = str_true_enc(str);
7965 if (case_option_single_p(flags, enc, str)) {
7966 if (upcase_single(str))
7967 flags |= ONIGENC_CASE_MODIFIED;
7968 }
7969 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7970 rb_str_ascii_casemap(str, str, &flags, enc);
7971 else
7972 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7973
7974 if (ONIGENC_CASE_MODIFIED&flags) return str;
7975 return Qnil;
7976}
7977
7978
7979/*
7980 * call-seq:
7981 * upcase(mapping = :ascii) -> new_string
7982 *
7983 * :include: doc/string/upcase.rdoc
7984 */
7985
7986static VALUE
7987rb_str_upcase(int argc, VALUE *argv, VALUE str)
7988{
7989 rb_encoding *enc;
7990 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7991 VALUE ret;
7992
7993 flags = check_case_options(argc, argv, flags);
7994 enc = str_true_enc(str);
7995 if (case_option_single_p(flags, enc, str)) {
7996 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7997 str_enc_copy_direct(ret, str);
7998 upcase_single(ret);
7999 }
8000 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8001 ret = rb_str_new(0, RSTRING_LEN(str));
8002 rb_str_ascii_casemap(str, ret, &flags, enc);
8003 }
8004 else {
8005 ret = rb_str_casemap(str, &flags, enc);
8006 }
8007
8008 return ret;
8009}
8010
8011static bool
8012downcase_single(VALUE str)
8013{
8014 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8015 bool modified = false;
8016
8017 while (s < send) {
8018 unsigned int c = *(unsigned char*)s;
8019
8020 if ('A' <= c && c <= 'Z') {
8021 *s = 'a' + (c - 'A');
8022 modified = true;
8023 }
8024 s++;
8025 }
8026
8027 return modified;
8028}
8029
8030/*
8031 * call-seq:
8032 * downcase!(mapping) -> self or nil
8033 *
8034 * Like String#downcase, except that:
8035 *
8036 * - Changes character casings in +self+ (not in a copy of +self+).
8037 * - Returns +self+ if any changes are made, +nil+ otherwise.
8038 *
8039 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8040 */
8041
8042static VALUE
8043rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
8044{
8045 rb_encoding *enc;
8046 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8047
8048 flags = check_case_options(argc, argv, flags);
8049 str_modify_keep_cr(str);
8050 enc = str_true_enc(str);
8051 if (case_option_single_p(flags, enc, str)) {
8052 if (downcase_single(str))
8053 flags |= ONIGENC_CASE_MODIFIED;
8054 }
8055 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8056 rb_str_ascii_casemap(str, str, &flags, enc);
8057 else
8058 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8059
8060 if (ONIGENC_CASE_MODIFIED&flags) return str;
8061 return Qnil;
8062}
8063
8064
8065/*
8066 * call-seq:
8067 * downcase(mapping = :ascii) -> new_string
8068 *
8069 * :include: doc/string/downcase.rdoc
8070 *
8071 */
8072
8073static VALUE
8074rb_str_downcase(int argc, VALUE *argv, VALUE str)
8075{
8076 rb_encoding *enc;
8077 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8078 VALUE ret;
8079
8080 flags = check_case_options(argc, argv, flags);
8081 enc = str_true_enc(str);
8082 if (case_option_single_p(flags, enc, str)) {
8083 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8084 str_enc_copy_direct(ret, str);
8085 downcase_single(ret);
8086 }
8087 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8088 ret = rb_str_new(0, RSTRING_LEN(str));
8089 rb_str_ascii_casemap(str, ret, &flags, enc);
8090 }
8091 else {
8092 ret = rb_str_casemap(str, &flags, enc);
8093 }
8094
8095 return ret;
8096}
8097
8098
8099/*
8100 * call-seq:
8101 * capitalize!(mapping = :ascii) -> self or nil
8102 *
8103 * Like String#capitalize, except that:
8104 *
8105 * - Changes character casings in +self+ (not in a copy of +self+).
8106 * - Returns +self+ if any changes are made, +nil+ otherwise.
8107 *
8108 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8109 */
8110
8111static VALUE
8112rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
8113{
8114 rb_encoding *enc;
8115 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8116
8117 flags = check_case_options(argc, argv, flags);
8118 str_modify_keep_cr(str);
8119 enc = str_true_enc(str);
8120 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8121 if (flags&ONIGENC_CASE_ASCII_ONLY)
8122 rb_str_ascii_casemap(str, str, &flags, enc);
8123 else
8124 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8125
8126 if (ONIGENC_CASE_MODIFIED&flags) return str;
8127 return Qnil;
8128}
8129
8130
8131/*
8132 * call-seq:
8133 * capitalize(mapping = :ascii) -> new_string
8134 *
8135 * :include: doc/string/capitalize.rdoc
8136 *
8137 */
8138
8139static VALUE
8140rb_str_capitalize(int argc, VALUE *argv, VALUE str)
8141{
8142 rb_encoding *enc;
8143 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8144 VALUE ret;
8145
8146 flags = check_case_options(argc, argv, flags);
8147 enc = str_true_enc(str);
8148 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
8149 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8150 ret = rb_str_new(0, RSTRING_LEN(str));
8151 rb_str_ascii_casemap(str, ret, &flags, enc);
8152 }
8153 else {
8154 ret = rb_str_casemap(str, &flags, enc);
8155 }
8156 return ret;
8157}
8158
8159
8160/*
8161 * call-seq:
8162 * swapcase!(mapping) -> self or nil
8163 *
8164 * Like String#swapcase, except that:
8165 *
8166 * - Changes are made to +self+, not to a copy of +self+.
8167 * - Returns +self+ if any changes are made, +nil+ otherwise.
8168 *
8169 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8170 */
8171
8172static VALUE
8173rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
8174{
8175 rb_encoding *enc;
8176 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8177
8178 flags = check_case_options(argc, argv, flags);
8179 str_modify_keep_cr(str);
8180 enc = str_true_enc(str);
8181 if (flags&ONIGENC_CASE_ASCII_ONLY)
8182 rb_str_ascii_casemap(str, str, &flags, enc);
8183 else
8184 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8185
8186 if (ONIGENC_CASE_MODIFIED&flags) return str;
8187 return Qnil;
8188}
8189
8190
8191/*
8192 * call-seq:
8193 * swapcase(mapping = :ascii) -> new_string
8194 *
8195 * :include: doc/string/swapcase.rdoc
8196 *
8197 */
8198
8199static VALUE
8200rb_str_swapcase(int argc, VALUE *argv, VALUE str)
8201{
8202 rb_encoding *enc;
8203 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8204 VALUE ret;
8205
8206 flags = check_case_options(argc, argv, flags);
8207 enc = str_true_enc(str);
8208 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
8209 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8210 ret = rb_str_new(0, RSTRING_LEN(str));
8211 rb_str_ascii_casemap(str, ret, &flags, enc);
8212 }
8213 else {
8214 ret = rb_str_casemap(str, &flags, enc);
8215 }
8216 return ret;
8217}
8218
8219typedef unsigned char *USTR;
8220
/* Cursor over a tr-style character selector, advanced by trnext(). */
struct tr {
    int gen;            /* non-zero while a range is being expanded */
    unsigned int now;   /* code point produced most recently */
    unsigned int max;   /* last code point of the range being expanded */
    char *p;            /* next unread byte of the selector */
    char *pend;         /* end of the selector */
};
8226
8227static unsigned int
8228trnext(struct tr *t, rb_encoding *enc)
8229{
8230 int n;
8231
8232 for (;;) {
8233 nextpart:
8234 if (!t->gen) {
8235 if (t->p == t->pend) return -1;
8236 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
8237 t->p += n;
8238 }
8239 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8240 t->p += n;
8241 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
8242 t->p += n;
8243 if (t->p < t->pend) {
8244 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8245 t->p += n;
8246 if (t->now > c) {
8247 if (t->now < 0x80 && c < 0x80) {
8248 rb_raise(rb_eArgError,
8249 "invalid range \"%c-%c\" in string transliteration",
8250 t->now, c);
8251 }
8252 else {
8253 rb_raise(rb_eArgError, "invalid range in string transliteration");
8254 }
8255 continue; /* not reached */
8256 }
8257 else if (t->now < c) {
8258 t->gen = 1;
8259 t->max = c;
8260 }
8261 }
8262 }
8263 return t->now;
8264 }
8265 else {
8266 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8267 if (t->now == t->max) {
8268 t->gen = 0;
8269 goto nextpart;
8270 }
8271 }
8272 if (t->now < t->max) {
8273 return t->now;
8274 }
8275 else {
8276 t->gen = 0;
8277 return t->max;
8278 }
8279 }
8280 }
8281}
8282
8283static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
8284
8285static VALUE
8286tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
8287{
8288 const unsigned int errc = -1;
8289 unsigned int trans[256];
8290 rb_encoding *enc, *e1, *e2;
8291 struct tr trsrc, trrepl;
8292 int cflag = 0;
8293 unsigned int c, c0, last = 0;
8294 int modify = 0, i, l;
8295 unsigned char *s, *send;
8296 VALUE hash = 0;
8297 int singlebyte = single_byte_optimizable(str);
8298 int termlen;
8299 int cr;
8300
8301#define CHECK_IF_ASCII(c) \
8302 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8303 (cr = ENC_CODERANGE_VALID) : 0)
8304
8305 StringValue(src);
8306 StringValue(repl);
8307 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8308 if (RSTRING_LEN(repl) == 0) {
8309 return rb_str_delete_bang(1, &src, str);
8310 }
8311
8312 cr = ENC_CODERANGE(str);
8313 e1 = rb_enc_check(str, src);
8314 e2 = rb_enc_check(str, repl);
8315 if (e1 == e2) {
8316 enc = e1;
8317 }
8318 else {
8319 enc = rb_enc_check(src, repl);
8320 }
8321 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8322 if (RSTRING_LEN(src) > 1 &&
8323 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
8324 trsrc.p + l < trsrc.pend) {
8325 cflag = 1;
8326 trsrc.p += l;
8327 }
8328 trrepl.p = RSTRING_PTR(repl);
8329 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8330 trsrc.gen = trrepl.gen = 0;
8331 trsrc.now = trrepl.now = 0;
8332 trsrc.max = trrepl.max = 0;
8333
8334 if (cflag) {
8335 for (i=0; i<256; i++) {
8336 trans[i] = 1;
8337 }
8338 while ((c = trnext(&trsrc, enc)) != errc) {
8339 if (c < 256) {
8340 trans[c] = errc;
8341 }
8342 else {
8343 if (!hash) hash = rb_hash_new();
8344 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
8345 }
8346 }
8347 while ((c = trnext(&trrepl, enc)) != errc)
8348 /* retrieve last replacer */;
8349 last = trrepl.now;
8350 for (i=0; i<256; i++) {
8351 if (trans[i] != errc) {
8352 trans[i] = last;
8353 }
8354 }
8355 }
8356 else {
8357 unsigned int r;
8358
8359 for (i=0; i<256; i++) {
8360 trans[i] = errc;
8361 }
8362 while ((c = trnext(&trsrc, enc)) != errc) {
8363 r = trnext(&trrepl, enc);
8364 if (r == errc) r = trrepl.now;
8365 if (c < 256) {
8366 trans[c] = r;
8367 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8368 }
8369 else {
8370 if (!hash) hash = rb_hash_new();
8371 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
8372 }
8373 }
8374 }
8375
8376 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8377 cr = ENC_CODERANGE_7BIT;
8378 str_modify_keep_cr(str);
8379 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8380 termlen = rb_enc_mbminlen(enc);
8381 if (sflag) {
8382 int clen, tlen;
8383 long offset, max = RSTRING_LEN(str);
8384 unsigned int save = -1;
8385 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8386
8387 while (s < send) {
8388 int may_modify = 0;
8389
8390 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8391 if (!MBCLEN_CHARFOUND_P(r)) {
8392 xfree(buf);
8393 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8394 }
8395 clen = MBCLEN_CHARFOUND_LEN(r);
8396 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8397
8398 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8399
8400 s += clen;
8401 if (c < 256) {
8402 c = trans[c];
8403 }
8404 else if (hash) {
8405 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8406 if (NIL_P(tmp)) {
8407 if (cflag) c = last;
8408 else c = errc;
8409 }
8410 else if (cflag) c = errc;
8411 else c = NUM2INT(tmp);
8412 }
8413 else {
8414 c = errc;
8415 }
8416 if (c != (unsigned int)-1) {
8417 if (save == c) {
8418 CHECK_IF_ASCII(c);
8419 continue;
8420 }
8421 save = c;
8422 tlen = rb_enc_codelen(c, enc);
8423 modify = 1;
8424 }
8425 else {
8426 save = -1;
8427 c = c0;
8428 if (enc != e1) may_modify = 1;
8429 }
8430 if ((offset = t - buf) + tlen > max) {
8431 size_t MAYBE_UNUSED(old) = max + termlen;
8432 max = offset + tlen + (send - s);
8433 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8434 t = buf + offset;
8435 }
8436 rb_enc_mbcput(c, t, enc);
8437 if (may_modify && memcmp(s, t, tlen) != 0) {
8438 modify = 1;
8439 }
8440 CHECK_IF_ASCII(c);
8441 t += tlen;
8442 }
8443 if (!STR_EMBED_P(str)) {
8444 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8445 }
8446 TERM_FILL((char *)t, termlen);
8447 RSTRING(str)->as.heap.ptr = (char *)buf;
8448 STR_SET_LEN(str, t - buf);
8449 STR_SET_NOEMBED(str);
8450 RSTRING(str)->as.heap.aux.capa = max;
8451 }
8452 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
8453 while (s < send) {
8454 c = (unsigned char)*s;
8455 if (trans[c] != errc) {
8456 if (!cflag) {
8457 c = trans[c];
8458 *s = c;
8459 modify = 1;
8460 }
8461 else {
8462 *s = last;
8463 modify = 1;
8464 }
8465 }
8466 CHECK_IF_ASCII(c);
8467 s++;
8468 }
8469 }
8470 else {
8471 int clen, tlen;
8472 long offset, max = (long)((send - s) * 1.2);
8473 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8474
8475 while (s < send) {
8476 int may_modify = 0;
8477
8478 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8479 if (!MBCLEN_CHARFOUND_P(r)) {
8480 xfree(buf);
8481 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8482 }
8483 clen = MBCLEN_CHARFOUND_LEN(r);
8484 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8485
8486 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8487
8488 if (c < 256) {
8489 c = trans[c];
8490 }
8491 else if (hash) {
8492 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8493 if (NIL_P(tmp)) {
8494 if (cflag) c = last;
8495 else c = errc;
8496 }
8497 else if (cflag) c = errc;
8498 else c = NUM2INT(tmp);
8499 }
8500 else {
8501 c = cflag ? last : errc;
8502 }
8503 if (c != errc) {
8504 tlen = rb_enc_codelen(c, enc);
8505 modify = 1;
8506 }
8507 else {
8508 c = c0;
8509 if (enc != e1) may_modify = 1;
8510 }
8511 if ((offset = t - buf) + tlen > max) {
8512 size_t MAYBE_UNUSED(old) = max + termlen;
8513 max = offset + tlen + (long)((send - s) * 1.2);
8514 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8515 t = buf + offset;
8516 }
8517 if (s != t) {
8518 rb_enc_mbcput(c, t, enc);
8519 if (may_modify && memcmp(s, t, tlen) != 0) {
8520 modify = 1;
8521 }
8522 }
8523 CHECK_IF_ASCII(c);
8524 s += clen;
8525 t += tlen;
8526 }
8527 if (!STR_EMBED_P(str)) {
8528 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8529 }
8530 TERM_FILL((char *)t, termlen);
8531 RSTRING(str)->as.heap.ptr = (char *)buf;
8532 STR_SET_LEN(str, t - buf);
8533 STR_SET_NOEMBED(str);
8534 RSTRING(str)->as.heap.aux.capa = max;
8535 }
8536
8537 if (modify) {
8538 if (cr != ENC_CODERANGE_BROKEN)
8539 ENC_CODERANGE_SET(str, cr);
8540 rb_enc_associate(str, enc);
8541 return str;
8542 }
8543 return Qnil;
8544}
8545
8546
8547/*
8548 * call-seq:
8549 * tr!(selector, replacements) -> self or nil
8550 *
8551 * Like String#tr, except:
8552 *
8553 * - Performs substitutions in +self+ (not in a copy of +self+).
8554 * - Returns +self+ if any modifications were made, +nil+ otherwise.
8555 *
8556 * Related: {Modifying}[rdoc-ref:String@Modifying].
8557 */
8558
8559static VALUE
8560rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8561{
8562 return tr_trans(str, src, repl, 0);
8563}
8564
8565
8566/*
8567 * call-seq:
8568 * tr(selector, replacements) -> new_string
8569 *
8570 * Returns a copy of +self+ with each character specified by string +selector+
8571 * translated to the corresponding character in string +replacements+.
8572 * The correspondence is _positional_:
8573 *
8574 * - Each occurrence of the first character specified by +selector+
8575 * is translated to the first character in +replacements+.
8576 * - Each occurrence of the second character specified by +selector+
8577 * is translated to the second character in +replacements+.
8578 * - And so on.
8579 *
8580 * Example:
8581 *
8582 * 'hello'.tr('el', 'ip') #=> "hippo"
8583 *
8584 * If +replacements+ is shorter than +selector+,
8585 * it is implicitly padded with its own last character:
8586 *
8587 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8588 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8589 *
8590 * Arguments +selector+ and +replacements+ must be valid character selectors
8591 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8592 * and may use any of its valid forms, including negation, ranges, and escapes:
8593 *
8594 * 'hello'.tr('^aeiou', '-') # => "-e--o" # Negation.
8595 * 'ibm'.tr('b-z', 'a-z') # => "hal" # Range.
8596 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8597 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8598 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8599 *
8600 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
8601 */
8602
8603static VALUE
8604rb_str_tr(VALUE str, VALUE src, VALUE repl)
8605{
8606 str = str_duplicate(rb_cString, str);
8607 tr_trans(str, src, repl, 0);
8608 return str;
8609}
8610
/*
 * TR_TABLE_MAX is the number of code points representable in one byte
 * (0..UCHAR_MAX).  TR_TABLE_SIZE reserves one extra slot,
 * table[TR_TABLE_MAX], which tr_setup_table() maintains as a flag and
 * tr_find() returns as the fallback answer for code points >= TR_TABLE_MAX.
 */
#define TR_TABLE_MAX (UCHAR_MAX+1)
#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
/*
 * Folds one character selector +str+ into the lookup state shared by
 * delete/squeeze/count.
 *
 * str     - the selector string; a leading '^' (when the selector is longer
 *           than one byte) marks it as negated.
 * stable  - byte table of TR_TABLE_SIZE entries.  When +first+ is true it is
 *           reset to all-ones before this selector is applied; otherwise the
 *           selector is AND-ed into the existing contents.
 * first   - non-zero for the first selector of a call.
 * tablep  - in/out hash (or 0) updated for code points >= TR_TABLE_MAX
 *           coming from non-negated selectors.
 * ctablep - in/out hash (or 0) updated for code points >= TR_TABLE_MAX
 *           coming from negated selectors.
 * enc     - encoding used to decode +str+.
 */
static void
tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
               VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
{
    /* Sentinel compared against trnext()'s result to stop the scan. */
    const unsigned int errc = -1;
    /* Per-selector byte table; merged into stable at the end. */
    char buf[TR_TABLE_MAX];
    struct tr tr;
    unsigned int c;
    VALUE table = 0, ptable = 0;
    int i, l, cflag = 0;

    tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
    tr.gen = tr.now = tr.max = 0;

    /* A lone "^" is a literal caret; only longer selectors can be negated. */
    if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
        cflag = 1;
        tr.p += l;
    }
    if (first) {
        /* Start from "everything matches" so the AND below is a no-op base. */
        for (i=0; i<TR_TABLE_MAX; i++) {
            stable[i] = 1;
        }
        stable[TR_TABLE_MAX] = cflag;
    }
    else if (stable[TR_TABLE_MAX] && !cflag) {
        /* The extra slot stays set only while every selector is negated. */
        stable[TR_TABLE_MAX] = 0;
    }
    /* Negated selectors default to "match"; plain ones to "no match". */
    for (i=0; i<TR_TABLE_MAX; i++) {
        buf[i] = cflag;
    }

    while ((c = trnext(&tr, enc)) != errc) {
        if (c < TR_TABLE_MAX) {
            buf[(unsigned char)c] = !cflag;
        }
        else {
            VALUE key = UINT2NUM(c);

            /* Lazily pick (or create) the hash this selector writes into. */
            if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
                if (cflag) {
                    /* Negated selectors accumulate into the same hash. */
                    ptable = *ctablep;
                    table = ptable ? ptable : rb_hash_new();
                    *ctablep = table;
                }
                else {
                    /* Plain selectors build a fresh hash; the previous one
                     * (ptable) is consulted by the condition below. */
                    table = rb_hash_new();
                    ptable = *tablep;
                    *tablep = table;
                }
            }
            if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
                rb_hash_aset(table, key, Qtrue);
            }
        }
    }
    /* Intersect this selector's byte table with the accumulated one. */
    for (i=0; i<TR_TABLE_MAX; i++) {
        stable[i] = stable[i] && buf[i];
    }
    /* A plain selector that recorded nothing clears the caller's hash. */
    if (!table && !cflag) {
        *tablep = 0;
    }
}
8675
8676
8677static int
8678tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8679{
8680 if (c < TR_TABLE_MAX) {
8681 return table[c] != 0;
8682 }
8683 else {
8684 VALUE v = UINT2NUM(c);
8685
8686 if (del) {
8687 if (!NIL_P(rb_hash_lookup(del, v)) &&
8688 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8689 return TRUE;
8690 }
8691 }
8692 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8693 return FALSE;
8694 }
8695 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8696 }
8697}
8698
8699/*
8700 * call-seq:
8701 * delete!(*selectors) -> self or nil
8702 *
8703 * Like String#delete, but modifies +self+ in place;
8704 * returns +self+ if any characters were deleted, +nil+ otherwise.
8705 *
8706 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8707 */
8708
8709static VALUE
8710rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8711{
8712 char squeez[TR_TABLE_SIZE];
8713 rb_encoding *enc = 0;
8714 char *s, *send, *t;
8715 VALUE del = 0, nodel = 0;
8716 int modify = 0;
8717 int i, ascompat, cr;
8718
8719 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8721 for (i=0; i<argc; i++) {
8722 VALUE s = argv[i];
8723
8724 StringValue(s);
8725 enc = rb_enc_check(str, s);
8726 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8727 }
8728
8729 str_modify_keep_cr(str);
8730 ascompat = rb_enc_asciicompat(enc);
8731 s = t = RSTRING_PTR(str);
8732 send = RSTRING_END(str);
8733 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8734 while (s < send) {
8735 unsigned int c;
8736 int clen;
8737
8738 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8739 if (squeez[c]) {
8740 modify = 1;
8741 }
8742 else {
8743 if (t != s) *t = c;
8744 t++;
8745 }
8746 s++;
8747 }
8748 else {
8749 c = rb_enc_codepoint_len(s, send, &clen, enc);
8750
8751 if (tr_find(c, squeez, del, nodel)) {
8752 modify = 1;
8753 }
8754 else {
8755 if (t != s) rb_enc_mbcput(c, t, enc);
8756 t += clen;
8758 }
8759 s += clen;
8760 }
8761 }
8762 TERM_FILL(t, TERM_LEN(str));
8763 STR_SET_LEN(str, t - RSTRING_PTR(str));
8764 ENC_CODERANGE_SET(str, cr);
8765
8766 if (modify) return str;
8767 return Qnil;
8768}
8769
8770
8771/*
8772 * call-seq:
8773 * delete(*selectors) -> new_string
8774 *
8775 * :include: doc/string/delete.rdoc
8776 *
8777 */
8778
8779static VALUE
8780rb_str_delete(int argc, VALUE *argv, VALUE str)
8781{
8782 str = str_duplicate(rb_cString, str);
8783 rb_str_delete_bang(argc, argv, str);
8784 return str;
8785}
8786
8787
8788/*
8789 * call-seq:
8790 * squeeze!(*selectors) -> self or nil
8791 *
8792 * Like String#squeeze, except that:
8793 *
8794 * - Characters are squeezed in +self+ (not in a copy of +self+).
8795 * - Returns +self+ if any changes are made, +nil+ otherwise.
8796 *
8797 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8798 */
8799
8800static VALUE
8801rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8802{
8803 char squeez[TR_TABLE_SIZE];
8804 rb_encoding *enc = 0;
8805 VALUE del = 0, nodel = 0;
8806 unsigned char *s, *send, *t;
8807 int i, modify = 0;
8808 int ascompat, singlebyte = single_byte_optimizable(str);
8809 unsigned int save;
8810
8811 if (argc == 0) {
8812 enc = STR_ENC_GET(str);
8813 }
8814 else {
8815 for (i=0; i<argc; i++) {
8816 VALUE s = argv[i];
8817
8818 StringValue(s);
8819 enc = rb_enc_check(str, s);
8820 if (singlebyte && !single_byte_optimizable(s))
8821 singlebyte = 0;
8822 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8823 }
8824 }
8825
8826 str_modify_keep_cr(str);
8827 s = t = (unsigned char *)RSTRING_PTR(str);
8828 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8829 send = (unsigned char *)RSTRING_END(str);
8830 save = -1;
8831 ascompat = rb_enc_asciicompat(enc);
8832
8833 if (singlebyte) {
8834 while (s < send) {
8835 unsigned int c = *s++;
8836 if (c != save || (argc > 0 && !squeez[c])) {
8837 *t++ = save = c;
8838 }
8839 }
8840 }
8841 else {
8842 while (s < send) {
8843 unsigned int c;
8844 int clen;
8845
8846 if (ascompat && (c = *s) < 0x80) {
8847 if (c != save || (argc > 0 && !squeez[c])) {
8848 *t++ = save = c;
8849 }
8850 s++;
8851 }
8852 else {
8853 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8854
8855 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8856 if (t != s) rb_enc_mbcput(c, t, enc);
8857 save = c;
8858 t += clen;
8859 }
8860 s += clen;
8861 }
8862 }
8863 }
8864
8865 TERM_FILL((char *)t, TERM_LEN(str));
8866 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8867 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8868 modify = 1;
8869 }
8870
8871 if (modify) return str;
8872 return Qnil;
8873}
8874
8875
8876/*
8877 * call-seq:
8878 * squeeze(*selectors) -> new_string
8879 *
8880 * :include: doc/string/squeeze.rdoc
8881 *
8882 */
8883
8884static VALUE
8885rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8886{
8887 str = str_duplicate(rb_cString, str);
8888 rb_str_squeeze_bang(argc, argv, str);
8889 return str;
8890}
8891
8892
8893/*
8894 * call-seq:
8895 * tr_s!(selector, replacements) -> self or nil
8896 *
8897 * Like String#tr_s, except:
8898 *
8899 * - Modifies +self+ in place (not a copy of +self+).
8900 * - Returns +self+ if any changes were made, +nil+ otherwise.
8901 *
8902 * Related: {Modifying}[rdoc-ref:String@Modifying].
8903 */
8904
8905static VALUE
8906rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8907{
8908 return tr_trans(str, src, repl, 1);
8909}
8910
8911
8912/*
8913 * call-seq:
8914 * tr_s(selector, replacements) -> new_string
8915 *
8916 * Like String#tr, except:
8917 *
8918 * - Also squeezes the modified portions of the translated string;
8919 * see String#squeeze.
8920 * - Returns the translated and squeezed string.
8921 *
8922 * Examples:
8923 *
8924 * 'hello'.tr_s('l', 'r') #=> "hero"
8925 * 'hello'.tr_s('el', '-') #=> "h-o"
8926 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
8927 *
8928 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
8929 *
8930 */
8931
8932static VALUE
8933rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8934{
8935 str = str_duplicate(rb_cString, str);
8936 tr_trans(str, src, repl, 1);
8937 return str;
8938}
8939
8940
8941/*
8942 * call-seq:
8943 * count(*selectors) -> integer
8944 *
8945 * :include: doc/string/count.rdoc
8946 */
8947
8948static VALUE
8949rb_str_count(int argc, VALUE *argv, VALUE str)
8950{
8951 char table[TR_TABLE_SIZE];
8952 rb_encoding *enc = 0;
8953 VALUE del = 0, nodel = 0, tstr;
8954 char *s, *send;
8955 int i;
8956 int ascompat;
8957 size_t n = 0;
8958
8960
8961 tstr = argv[0];
8962 StringValue(tstr);
8963 enc = rb_enc_check(str, tstr);
8964 if (argc == 1) {
8965 const char *ptstr;
8966 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
8967 (ptstr = RSTRING_PTR(tstr),
8968 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
8969 !is_broken_string(str)) {
8970 int clen;
8971 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
8972
8973 s = RSTRING_PTR(str);
8974 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8975 send = RSTRING_END(str);
8976 while (s < send) {
8977 if (*(unsigned char*)s++ == c) n++;
8978 }
8979 return SIZET2NUM(n);
8980 }
8981 }
8982
8983 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
8984 for (i=1; i<argc; i++) {
8985 tstr = argv[i];
8986 StringValue(tstr);
8987 enc = rb_enc_check(str, tstr);
8988 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
8989 }
8990
8991 s = RSTRING_PTR(str);
8992 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8993 send = RSTRING_END(str);
8994 ascompat = rb_enc_asciicompat(enc);
8995 while (s < send) {
8996 unsigned int c;
8997
8998 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8999 if (table[c]) {
9000 n++;
9001 }
9002 s++;
9003 }
9004 else {
9005 int clen;
9006 c = rb_enc_codepoint_len(s, send, &clen, enc);
9007 if (tr_find(c, table, del, nodel)) {
9008 n++;
9009 }
9010 s += clen;
9011 }
9012 }
9013
9014 return SIZET2NUM(n);
9015}
9016
9017static VALUE
9018rb_fs_check(VALUE val)
9019{
9020 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
9021 val = rb_check_string_type(val);
9022 if (NIL_P(val)) return 0;
9023 }
9024 return val;
9025}
9026
/*
 * Lookup table indexed by byte value: 1 for the six ASCII whitespace
 * characters (TAB, LF, VT, FF, CR, SPACE), 0 for every other byte.
 * Entries not listed are zero-initialised.
 */
static const char isspacetable[256] = {
    ['\t'] = 1,
    ['\n'] = 1,
    ['\v'] = 1,
    ['\f'] = 1,
    ['\r'] = 1,
    [' ']  = 1,
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9047
9048static long
9049split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
9050{
9051 if (empty_count >= 0 && len == 0) {
9052 return empty_count + 1;
9053 }
9054 if (empty_count > 0) {
9055 /* make different substrings */
9056 if (result) {
9057 do {
9058 rb_ary_push(result, str_new_empty_String(str));
9059 } while (--empty_count > 0);
9060 }
9061 else {
9062 do {
9063 rb_yield(str_new_empty_String(str));
9064 } while (--empty_count > 0);
9065 }
9066 }
9067 str = rb_str_subseq(str, beg, len);
9068 if (result) {
9069 rb_ary_push(result, str);
9070 }
9071 else {
9072 rb_yield(str);
9073 }
9074 return empty_count;
9075}
9076
/*
 * Splitting strategy selected by rb_str_split_m():
 * AWK splits on runs of whitespace, STRING on a literal separator,
 * REGEXP on pattern matches, and CHARS into individual characters.
 */
typedef enum {
    SPLIT_TYPE_AWK, SPLIT_TYPE_STRING, SPLIT_TYPE_REGEXP, SPLIT_TYPE_CHARS
} split_type_t;
9080
9081static split_type_t
9082literal_split_pattern(VALUE spat, split_type_t default_type)
9083{
9084 rb_encoding *enc = STR_ENC_GET(spat);
9085 const char *ptr;
9086 long len;
9087 RSTRING_GETMEM(spat, ptr, len);
9088 if (len == 0) {
9089 /* Special case - split into chars */
9090 return SPLIT_TYPE_CHARS;
9091 }
9092 else if (rb_enc_asciicompat(enc)) {
9093 if (len == 1 && ptr[0] == ' ') {
9094 return SPLIT_TYPE_AWK;
9095 }
9096 }
9097 else {
9098 int l;
9099 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
9100 return SPLIT_TYPE_AWK;
9101 }
9102 }
9103 return default_type;
9104}
9105
9106/*
9107 * call-seq:
9108 * split(field_sep = $;, limit = 0) -> array_of_substrings
9109 * split(field_sep = $;, limit = 0) {|substring| ... } -> self
9110 *
9111 * :include: doc/string/split.rdoc
9112 *
9113 */
9114
9115static VALUE
9116rb_str_split_m(int argc, VALUE *argv, VALUE str)
9117{
9118 rb_encoding *enc;
9119 VALUE spat;
9120 VALUE limit;
9121 split_type_t split_type;
9122 long beg, end, i = 0, empty_count = -1;
9123 int lim = 0;
9124 VALUE result, tmp;
9125
9126 result = rb_block_given_p() ? Qfalse : Qnil;
9127 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
9128 lim = NUM2INT(limit);
9129 if (lim <= 0) limit = Qnil;
9130 else if (lim == 1) {
9131 if (RSTRING_LEN(str) == 0)
9132 return result ? rb_ary_new2(0) : str;
9133 tmp = str_duplicate(rb_cString, str);
9134 if (!result) {
9135 rb_yield(tmp);
9136 return str;
9137 }
9138 return rb_ary_new3(1, tmp);
9139 }
9140 i = 1;
9141 }
9142 if (NIL_P(limit) && !lim) empty_count = 0;
9143
9144 enc = STR_ENC_GET(str);
9145 split_type = SPLIT_TYPE_REGEXP;
9146 if (!NIL_P(spat)) {
9147 spat = get_pat_quoted(spat, 0);
9148 }
9149 else if (NIL_P(spat = rb_fs)) {
9150 split_type = SPLIT_TYPE_AWK;
9151 }
9152 else if (!(spat = rb_fs_check(spat))) {
9153 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
9154 }
9155 else {
9156 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
9157 }
9158 if (split_type != SPLIT_TYPE_AWK) {
9159 switch (BUILTIN_TYPE(spat)) {
9160 case T_REGEXP:
9161 rb_reg_options(spat); /* check if uninitialized */
9162 tmp = RREGEXP_SRC(spat);
9163 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9164 if (split_type == SPLIT_TYPE_AWK) {
9165 spat = tmp;
9166 split_type = SPLIT_TYPE_STRING;
9167 }
9168 break;
9169
9170 case T_STRING:
9171 mustnot_broken(spat);
9172 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9173 break;
9174
9175 default:
9177 }
9178 }
9179
9180#define SPLIT_STR(beg, len) ( \
9181 empty_count = split_string(result, str, beg, len, empty_count), \
9182 str_mod_check(str, str_start, str_len))
9183
9184 beg = 0;
9185 char *ptr = RSTRING_PTR(str);
9186 char *const str_start = ptr;
9187 const long str_len = RSTRING_LEN(str);
9188 char *const eptr = str_start + str_len;
9189 if (split_type == SPLIT_TYPE_AWK) {
9190 char *bptr = ptr;
9191 int skip = 1;
9192 unsigned int c;
9193
9194 if (result) result = rb_ary_new();
9195 end = beg;
9196 if (is_ascii_string(str)) {
9197 while (ptr < eptr) {
9198 c = (unsigned char)*ptr++;
9199 if (skip) {
9200 if (ascii_isspace(c)) {
9201 beg = ptr - bptr;
9202 }
9203 else {
9204 end = ptr - bptr;
9205 skip = 0;
9206 if (!NIL_P(limit) && lim <= i) break;
9207 }
9208 }
9209 else if (ascii_isspace(c)) {
9210 SPLIT_STR(beg, end-beg);
9211 skip = 1;
9212 beg = ptr - bptr;
9213 if (!NIL_P(limit)) ++i;
9214 }
9215 else {
9216 end = ptr - bptr;
9217 }
9218 }
9219 }
9220 else {
9221 while (ptr < eptr) {
9222 int n;
9223
9224 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9225 ptr += n;
9226 if (skip) {
9227 if (rb_isspace(c)) {
9228 beg = ptr - bptr;
9229 }
9230 else {
9231 end = ptr - bptr;
9232 skip = 0;
9233 if (!NIL_P(limit) && lim <= i) break;
9234 }
9235 }
9236 else if (rb_isspace(c)) {
9237 SPLIT_STR(beg, end-beg);
9238 skip = 1;
9239 beg = ptr - bptr;
9240 if (!NIL_P(limit)) ++i;
9241 }
9242 else {
9243 end = ptr - bptr;
9244 }
9245 }
9246 }
9247 }
9248 else if (split_type == SPLIT_TYPE_STRING) {
9249 char *substr_start = ptr;
9250 char *sptr = RSTRING_PTR(spat);
9251 long slen = RSTRING_LEN(spat);
9252
9253 if (result) result = rb_ary_new();
9254 mustnot_broken(str);
9255 enc = rb_enc_check(str, spat);
9256 while (ptr < eptr &&
9257 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9258 /* Check we are at the start of a char */
9259 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
9260 if (t != ptr + end) {
9261 ptr = t;
9262 continue;
9263 }
9264 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9265 str_mod_check(spat, sptr, slen);
9266 ptr += end + slen;
9267 substr_start = ptr;
9268 if (!NIL_P(limit) && lim <= ++i) break;
9269 }
9270 beg = ptr - str_start;
9271 }
9272 else if (split_type == SPLIT_TYPE_CHARS) {
9273 int n;
9274
9275 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9276 mustnot_broken(str);
9277 enc = rb_enc_get(str);
9278 while (ptr < eptr &&
9279 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9280 SPLIT_STR(ptr - str_start, n);
9281 ptr += n;
9282 if (!NIL_P(limit) && lim <= ++i) break;
9283 }
9284 beg = ptr - str_start;
9285 }
9286 else {
9287 if (result) result = rb_ary_new();
9288 long len = RSTRING_LEN(str);
9289 long start = beg;
9290 long idx;
9291 int last_null = 0;
9292 struct re_registers *regs;
9293 VALUE match = 0;
9294
9295 for (; rb_reg_search(spat, str, start, 0) >= 0;
9296 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
9297 match = rb_backref_get();
9298 if (!result) rb_match_busy(match);
9299 regs = RMATCH_REGS(match);
9300 end = BEG(0);
9301 if (start == end && BEG(0) == END(0)) {
9302 if (!ptr) {
9303 SPLIT_STR(0, 0);
9304 break;
9305 }
9306 else if (last_null == 1) {
9307 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9308 beg = start;
9309 }
9310 else {
9311 if (start == len)
9312 start++;
9313 else
9314 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9315 last_null = 1;
9316 continue;
9317 }
9318 }
9319 else {
9320 SPLIT_STR(beg, end-beg);
9321 beg = start = END(0);
9322 }
9323 last_null = 0;
9324
9325 for (idx=1; idx < regs->num_regs; idx++) {
9326 if (BEG(idx) == -1) continue;
9327 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9328 }
9329 if (!NIL_P(limit) && lim <= ++i) break;
9330 }
9331 if (match) rb_match_unbusy(match);
9332 }
9333 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9334 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9335 }
9336
9337 return result ? result : str;
9338}
9339
9340VALUE
9341rb_str_split(VALUE str, const char *sep0)
9342{
9343 VALUE sep;
9344
9345 StringValue(str);
9346 sep = rb_str_new_cstr(sep0);
9347 return rb_str_split_m(1, &sep, str);
9348}
9349
/*
 * Evaluates to a new array with capacity +size+ when no block is given,
 * and to 0 when a block is given.  The +m+ argument is not used by the
 * expansion.
 */
#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9351
9352static inline int
9353enumerator_element(VALUE ary, VALUE e)
9354{
9355 if (ary) {
9356 rb_ary_push(ary, e);
9357 return 0;
9358 }
9359 else {
9360 rb_yield(e);
9361 return 1;
9362 }
9363}
9364
/* Shorthand for enumerator_element(); non-zero means the element was yielded. */
#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9366
9367static const char *
9368chomp_newline(const char *p, const char *e, rb_encoding *enc)
9369{
9370 const char *prev = rb_enc_prev_char(p, e, e, enc);
9371 if (rb_enc_is_newline(prev, e, enc)) {
9372 e = prev;
9373 prev = rb_enc_prev_char(p, e, e, enc);
9374 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9375 e = prev;
9376 }
9377 return e;
9378}
9379
9380static VALUE
9381get_rs(void)
9382{
9383 VALUE rs = rb_rs;
9384 if (!NIL_P(rs) &&
9385 (!RB_TYPE_P(rs, T_STRING) ||
9386 RSTRING_LEN(rs) != 1 ||
9387 RSTRING_PTR(rs)[0] != '\n')) {
9388 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9389 }
9390 return rs;
9391}
9392
9393#define rb_rs get_rs()
9394
/*
 * Shared implementation of String#each_line and String#lines.
 *
 * argc/argv - optional record separator plus an optional keyword hash
 *             (only +chomp+ is read).
 * str       - the receiver.
 * ary       - array to collect lines into, or 0 to yield each line.
 *
 * Returns +ary+ when it is non-zero, otherwise the original receiver.
 */
static VALUE
rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
{
    rb_encoding *enc;
    VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
    const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
    long pos, len, rslen;
    int rsnewline = 0;

    /* No positional argument: fall back to the global record separator. */
    if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
        rs = rb_rs;
    if (!NIL_P(opts)) {
        static ID keywords[1];
        if (!keywords[0]) {
            keywords[0] = rb_intern_const("chomp");
        }
        rb_get_kwargs(opts, keywords, 0, 1, &chomp);
        /* Normalise to a plain C truth value (absent keyword => false). */
        chomp = (!UNDEF_P(chomp) && RTEST(chomp));
    }

    /* A nil separator delivers the whole string as a single element. */
    if (NIL_P(rs)) {
        if (!ENUM_ELEM(ary, str)) {
            return ary;
        }
        else {
            return orig;
        }
    }

    if (!RSTRING_LEN(str)) goto end;
    /* Iterate over a frozen version so ptr/pend stay valid across yields. */
    str = rb_str_new_frozen(str);
    ptr = subptr = RSTRING_PTR(str);
    pend = RSTRING_END(str);
    len = RSTRING_LEN(str);
    StringValue(rs);
    rslen = RSTRING_LEN(rs);

    if (rs == rb_default_rs)
        enc = rb_enc_get(str);
    else
        enc = rb_enc_check(str, rs);

    if (rslen == 0) {
        /* paragraph mode */
        int n;
        const char *eol = NULL;
        subend = subptr;
        while (subend < pend) {
            long chomp_rslen = 0;
            do {
                /* n is the width of a '\r' at subend, or 0 if there is none. */
                if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
                    n = 0;
                rslen = n + rb_enc_mbclen(subend + n, pend, enc);
                if (rb_enc_is_newline(subend + n, pend, enc)) {
                    /* A newline right after the previous one ends the paragraph. */
                    if (eol == subend) break;
                    subend += rslen;
                    if (subptr) {
                        eol = subend;
                        chomp_rslen = -rslen;
                    }
                }
                else {
                    /* First non-newline character starts a new paragraph. */
                    if (!subptr) subptr = subend;
                    subend += rslen;
                }
                rslen = 0;
            } while (subend < pend);
            if (!subptr) break;
            if (rslen == 0) chomp_rslen = 0;
            line = rb_str_subseq(str, subptr - ptr,
                                 subend - subptr + (chomp ? chomp_rslen : rslen));
            if (ENUM_ELEM(ary, line)) {
                /* A block ran: make sure it did not modify the string. */
                str_mod_check(str, ptr, len);
            }
            subptr = eol = NULL;
        }
        goto end;
    }
    else {
        rsptr = RSTRING_PTR(rs);
        /* Separator is exactly one minimum-width character that is a newline. */
        if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
            rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
            rsnewline = 1;
        }
    }

    /* Re-encode the default separator for non-ASCII-compatible encodings. */
    if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
        rs = rb_str_new(rsptr, rslen);
        rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
        rsptr = RSTRING_PTR(rs);
        rslen = RSTRING_LEN(rs);
    }

    while (subptr < pend) {
        pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
        if (pos < 0) break;
        hit = subptr + pos;
        /* Reject hits that start in the middle of a character. */
        adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
        if (hit != adjusted) {
            subptr = adjusted;
            continue;
        }
        /* hit now points just past the separator; subend is the line end. */
        subend = hit += rslen;
        if (chomp) {
            if (rsnewline) {
                subend = chomp_newline(subptr, subend, enc);
            }
            else {
                subend -= rslen;
            }
        }
        line = rb_str_subseq(str, subptr - ptr, subend - subptr);
        if (ENUM_ELEM(ary, line)) {
            str_mod_check(str, ptr, len);
        }
        subptr = hit;
    }

    /* Trailing text after the last separator. */
    if (subptr != pend) {
        if (chomp) {
            if (rsnewline) {
                pend = chomp_newline(subptr, pend, enc);
            }
            else if (pend - subptr >= rslen &&
                     memcmp(pend - rslen, rsptr, rslen) == 0) {
                pend -= rslen;
            }
        }
        line = rb_str_subseq(str, subptr - ptr, pend - subptr);
        ENUM_ELEM(ary, line);
        RB_GC_GUARD(str);
    }

  end:
    if (ary)
        return ary;
    else
        return orig;
}
9534
9535/*
9536 * call-seq:
9537 * each_line(record_separator = $/, chomp: false) {|substring| ... } -> self
9538 * each_line(record_separator = $/, chomp: false) -> enumerator
9539 *
9540 * :include: doc/string/each_line.rdoc
9541 *
9542 */
9543
9544static VALUE
9545rb_str_each_line(int argc, VALUE *argv, VALUE str)
9546{
9547 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9548 return rb_str_enumerate_lines(argc, argv, str, 0);
9549}
9550
9551/*
9552 * call-seq:
9553 * lines(record_separator = $/, chomp: false) -> array_of_strings
9554 *
9555 * Returns substrings ("lines") of +self+
9556 * according to the given arguments:
9557 *
9558 * s = <<~EOT
9559 * This is the first line.
9560 * This is line two.
9561 *
9562 * This is line four.
9563 * This is line five.
9564 * EOT
9565 *
9566 * With the default argument values:
9567 *
9568 * $/ # => "\n"
9569 * s.lines
9570 * # =>
9571 * ["This is the first line.\n",
9572 * "This is line two.\n",
9573 * "\n",
9574 * "This is line four.\n",
9575 * "This is line five.\n"]
9576 *
9577 * With a different +record_separator+:
9578 *
9579 * record_separator = ' is '
9580 * s.lines(record_separator)
9581 * # =>
9582 * ["This is ",
9583 * "the first line.\nThis is ",
9584 * "line two.\n\nThis is ",
9585 * "line four.\nThis is ",
9586 * "line five.\n"]
9587 *
9588 * With keyword argument +chomp+ as +true+,
9589 * removes the trailing newline from each line:
9590 *
9591 * s.lines(chomp: true)
9592 * # =>
9593 * ["This is the first line.",
9594 * "This is line two.",
9595 * "",
9596 * "This is line four.",
9597 * "This is line five."]
9598 *
9599 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
9600 */
9601
9602static VALUE
9603rb_str_lines(int argc, VALUE *argv, VALUE str)
9604{
9605 VALUE ary = WANTARRAY("lines", 0);
9606 return rb_str_enumerate_lines(argc, argv, str, ary);
9607}
9608
9609static VALUE
9610rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9611{
9612 return LONG2FIX(RSTRING_LEN(str));
9613}
9614
9615static VALUE
9616rb_str_enumerate_bytes(VALUE str, VALUE ary)
9617{
9618 long i;
9619
9620 for (i=0; i<RSTRING_LEN(str); i++) {
9621 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9622 }
9623 if (ary)
9624 return ary;
9625 else
9626 return str;
9627}
9628
9629/*
9630 * call-seq:
9631 * each_byte {|byte| ... } -> self
9632 * each_byte -> enumerator
9633 *
9634 * :include: doc/string/each_byte.rdoc
9635 *
9636 */
9637
9638static VALUE
9639rb_str_each_byte(VALUE str)
9640{
9641 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9642 return rb_str_enumerate_bytes(str, 0);
9643}
9644
9645/*
9646 * call-seq:
9647 * bytes -> array_of_bytes
9648 *
9649 * :include: doc/string/bytes.rdoc
9650 *
9651 */
9652
9653static VALUE
9654rb_str_bytes(VALUE str)
9655{
9656 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9657 return rb_str_enumerate_bytes(str, ary);
9658}
9659
9660static VALUE
9661rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9662{
9663 return rb_str_length(str);
9664}
9665
9666static VALUE
9667rb_str_enumerate_chars(VALUE str, VALUE ary)
9668{
9669 VALUE orig = str;
9670 long i, len, n;
9671 const char *ptr;
9672 rb_encoding *enc;
9673
9674 str = rb_str_new_frozen(str);
9675 ptr = RSTRING_PTR(str);
9676 len = RSTRING_LEN(str);
9677 enc = rb_enc_get(str);
9678
9680 for (i = 0; i < len; i += n) {
9681 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9682 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9683 }
9684 }
9685 else {
9686 for (i = 0; i < len; i += n) {
9687 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9688 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9689 }
9690 }
9691 RB_GC_GUARD(str);
9692 if (ary)
9693 return ary;
9694 else
9695 return orig;
9696}
9697
9698/*
9699 * call-seq:
9700 * each_char {|char| ... } -> self
9701 * each_char -> enumerator
9702 *
9703 * :include: doc/string/each_char.rdoc
9704 *
9705 */
9706
9707static VALUE
9708rb_str_each_char(VALUE str)
9709{
9710 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9711 return rb_str_enumerate_chars(str, 0);
9712}
9713
9714/*
9715 * call-seq:
9716 * chars -> array_of_characters
9717 *
9718 * :include: doc/string/chars.rdoc
9719 *
9720 */
9721
9722static VALUE
9723rb_str_chars(VALUE str)
9724{
9725 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9726 return rb_str_enumerate_chars(str, ary);
9727}
9728
9729static VALUE
9730rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9731{
9732 VALUE orig = str;
9733 int n;
9734 unsigned int c;
9735 const char *ptr, *end;
9736 rb_encoding *enc;
9737
9738 if (single_byte_optimizable(str))
9739 return rb_str_enumerate_bytes(str, ary);
9740
9741 str = rb_str_new_frozen(str);
9742 ptr = RSTRING_PTR(str);
9743 end = RSTRING_END(str);
9744 enc = STR_ENC_GET(str);
9745
9746 while (ptr < end) {
9747 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9748 ENUM_ELEM(ary, UINT2NUM(c));
9749 ptr += n;
9750 }
9751 RB_GC_GUARD(str);
9752 if (ary)
9753 return ary;
9754 else
9755 return orig;
9756}
9757
9758/*
9759 * call-seq:
9760 * each_codepoint {|codepoint| ... } -> self
9761 * each_codepoint -> enumerator
9762 *
9763 * :include: doc/string/each_codepoint.rdoc
9764 *
9765 */
9766
9767static VALUE
9768rb_str_each_codepoint(VALUE str)
9769{
9770 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9771 return rb_str_enumerate_codepoints(str, 0);
9772}
9773
9774/*
9775 * call-seq:
9776 * codepoints -> array_of_integers
9777 *
9778 * :include: doc/string/codepoints.rdoc
9779 *
9780 */
9781
9782static VALUE
9783rb_str_codepoints(VALUE str)
9784{
9785 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9786 return rb_str_enumerate_codepoints(str, ary);
9787}
9788
/*
 * Compiles and returns a new regex for the pattern \X in encoding +enc+.
 * Calls rb_fatal() if compilation fails.  The regex comes from onig_new();
 * callers in this file that do not cache it release it with onig_free().
 */
static regex_t *
get_reg_grapheme_cluster(rb_encoding *enc)
{
    int encidx = rb_enc_to_index(enc);

    /* Default pattern bytes, used unless a UTF-16/32 case below overrides. */
    const OnigUChar source_ascii[] = "\\X";
    const OnigUChar *source = source_ascii;
    size_t source_len = sizeof(source_ascii) - 1;

    switch (encidx) {
        /* Spell out '\\' and 'X' in the byte order and width of each
         * UTF-16/UTF-32 variant. */
#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
#define CASE_UTF(e) \
      case ENCINDEX_UTF_##e: { \
        static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
        source = source_UTF_##e; \
        source_len = sizeof(source_UTF_##e); \
        break; \
      }
        CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
#undef CASE_UTF
#undef CHARS_16BE
#undef CHARS_16LE
#undef CHARS_32BE
#undef CHARS_32LE
    }

    regex_t *reg_grapheme_cluster;
    OnigErrorInfo einfo;
    int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
                     ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
    if (r) {
        /* Non-zero result: turn the error code into text and abort. */
        UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
        onig_error_code_to_str(message, r, &einfo);
        rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
    }

    return reg_grapheme_cluster;
}
9830
9831static regex_t *
9832get_cached_reg_grapheme_cluster(rb_encoding *enc)
9833{
9834 int encidx = rb_enc_to_index(enc);
9835 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9836
9837 if (encidx == rb_utf8_encindex()) {
9838 if (!reg_grapheme_cluster_utf8) {
9839 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9840 }
9841
9842 return reg_grapheme_cluster_utf8;
9843 }
9844
9845 return NULL;
9846}
9847
9848static VALUE
9849rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9850{
9851 size_t grapheme_cluster_count = 0;
9852 rb_encoding *enc = get_encoding(str);
9853 const char *ptr, *end;
9854
9855 if (!rb_enc_unicode_p(enc)) {
9856 return rb_str_length(str);
9857 }
9858
9859 bool cached_reg_grapheme_cluster = true;
9860 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9861 if (!reg_grapheme_cluster) {
9862 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9863 cached_reg_grapheme_cluster = false;
9864 }
9865
9866 ptr = RSTRING_PTR(str);
9867 end = RSTRING_END(str);
9868
9869 while (ptr < end) {
9870 OnigPosition len = onig_match(reg_grapheme_cluster,
9871 (const OnigUChar *)ptr, (const OnigUChar *)end,
9872 (const OnigUChar *)ptr, NULL, 0);
9873 if (len <= 0) break;
9874 grapheme_cluster_count++;
9875 ptr += len;
9876 }
9877
9878 if (!cached_reg_grapheme_cluster) {
9879 onig_free(reg_grapheme_cluster);
9880 }
9881
9882 return SIZET2NUM(grapheme_cluster_count);
9883}
9884
9885static VALUE
9886rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9887{
9888 VALUE orig = str;
9889 rb_encoding *enc = get_encoding(str);
9890 const char *ptr0, *ptr, *end;
9891
9892 if (!rb_enc_unicode_p(enc)) {
9893 return rb_str_enumerate_chars(str, ary);
9894 }
9895
9896 if (!ary) str = rb_str_new_frozen(str);
9897
9898 bool cached_reg_grapheme_cluster = true;
9899 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9900 if (!reg_grapheme_cluster) {
9901 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9902 cached_reg_grapheme_cluster = false;
9903 }
9904
9905 ptr0 = ptr = RSTRING_PTR(str);
9906 end = RSTRING_END(str);
9907
9908 while (ptr < end) {
9909 OnigPosition len = onig_match(reg_grapheme_cluster,
9910 (const OnigUChar *)ptr, (const OnigUChar *)end,
9911 (const OnigUChar *)ptr, NULL, 0);
9912 if (len <= 0) break;
9913 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9914 ptr += len;
9915 }
9916
9917 if (!cached_reg_grapheme_cluster) {
9918 onig_free(reg_grapheme_cluster);
9919 }
9920
9921 RB_GC_GUARD(str);
9922 if (ary)
9923 return ary;
9924 else
9925 return orig;
9926}
9927
9928/*
9929 * call-seq:
9930 * each_grapheme_cluster {|grapheme_cluster| ... } -> self
9931 * each_grapheme_cluster -> enumerator
9932 *
9933 * :include: doc/string/each_grapheme_cluster.rdoc
9934 *
9935 */
9936
9937static VALUE
9938rb_str_each_grapheme_cluster(VALUE str)
9939{
9940 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9941 return rb_str_enumerate_grapheme_clusters(str, 0);
9942}
9943
9944/*
9945 * call-seq:
9946 * grapheme_clusters -> array_of_grapheme_clusters
9947 *
9948 * :include: doc/string/grapheme_clusters.rdoc
9949 *
9950 */
9951
9952static VALUE
9953rb_str_grapheme_clusters(VALUE str)
9954{
9955 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9956 return rb_str_enumerate_grapheme_clusters(str, ary);
9957}
9958
9959static long
9960chopped_length(VALUE str)
9961{
9962 rb_encoding *enc = STR_ENC_GET(str);
9963 const char *p, *p2, *beg, *end;
9964
9965 beg = RSTRING_PTR(str);
9966 end = beg + RSTRING_LEN(str);
9967 if (beg >= end) return 0;
9968 p = rb_enc_prev_char(beg, end, end, enc);
9969 if (!p) return 0;
9970 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
9971 p2 = rb_enc_prev_char(beg, p, end, enc);
9972 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
9973 }
9974 return p - beg;
9975}
9976
9977/*
9978 * call-seq:
9979 * chop! -> self or nil
9980 *
9981 * Like String#chop, except that:
9982 *
9983 * - Removes trailing characters from +self+ (not from a copy of +self+).
9984 * - Returns +self+ if any characters are removed, +nil+ otherwise.
9985 *
9986 * Related: see {Modifying}[rdoc-ref:String@Modifying].
9987 */
9988
9989static VALUE
9990rb_str_chop_bang(VALUE str)
9991{
9992 str_modify_keep_cr(str);
9993 if (RSTRING_LEN(str) > 0) {
9994 long len;
9995 len = chopped_length(str);
9996 STR_SET_LEN(str, len);
9997 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9998 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10000 }
10001 return str;
10002 }
10003 return Qnil;
10004}
10005
10006
10007/*
10008 * call-seq:
10009 * chop -> new_string
10010 *
10011 * :include: doc/string/chop.rdoc
10012 *
10013 */
10014
10015static VALUE
10016rb_str_chop(VALUE str)
10017{
10018 return rb_str_subseq(str, 0, chopped_length(str));
10019}
10020
/*
 * Returns the byte length of [p, e) after removing one trailing line
 * terminator, where +p+ is the start and +e+ the end of +str+'s bytes.
 *
 * The byte-wise branch reads *(e-1) unconditionally, so the range must
 * not be empty; chompped_length() returns early for empty strings before
 * calling this.
 */
static long
smart_chomp(VALUE str, const char *e, const char *p)
{
    rb_encoding *enc = rb_enc_get(str);
    if (rb_enc_mbminlen(enc) > 1) {
        /* Encodings whose minimum character width exceeds one byte:
         * locate the last character and drop it if it is a newline. */
        const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
        if (rb_enc_is_newline(pp, e, enc)) {
            e = pp;
        }
        /* Then, whether or not a newline was dropped, also drop a "\r"
         * that is now the last character. */
        pp = e - rb_enc_mbminlen(enc);
        if (pp >= p) {
            pp = rb_enc_left_char_head(p, pp, e, enc);
            if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
                e = pp;
            }
        }
    }
    else {
        /* Byte-wise case: removes "\n", "\r\n" or a lone "\r". */
        switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
          case '\n':
            if (--e > p && *(e-1) == '\r') {
                --e;
            }
            break;
          case '\r':
            --e;
            break;
        }
    }
    return e - p;
}
10052
/*
 * Returns the byte length +str+ would have after removing the trailing
 * record separator +rs+; returns the current length when nothing is to
 * be removed.
 *
 * - +rs+ is rb_default_rs: delegates to smart_chomp().
 * - +rs+ is empty: removes every trailing newline, each optionally
 *   preceded by "\r".
 * - +rs+ is a single newline character: delegates to smart_chomp().
 * - otherwise: removes one trailing copy of +rs+ when +str+ ends with
 *   exactly those bytes and the match starts on a character boundary.
 */
static long
chompped_length(VALUE str, VALUE rs)
{
    rb_encoding *enc;
    int newline;
    char *pp, *e, *rsptr;
    long rslen;
    char *const p = RSTRING_PTR(str);
    long len = RSTRING_LEN(str);

    if (len == 0) return 0;
    e = p + len;
    if (rs == rb_default_rs) {
        return smart_chomp(str, e, p);
    }

    enc = rb_enc_get(str);
    RSTRING_GETMEM(rs, rsptr, rslen);
    if (rslen == 0) {
        /* Empty separator: strip all trailing newlines. */
        if (rb_enc_mbminlen(enc) > 1) {
            while (e > p) {
                pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
                if (!rb_enc_is_newline(pp, e, enc)) break;
                e = pp;
                /* Also consume a "\r" immediately before the newline. */
                pp -= rb_enc_mbminlen(enc);
                if (pp >= p) {
                    pp = rb_enc_left_char_head(p, pp, e, enc);
                    if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
                        e = pp;
                    }
                }
            }
        }
        else {
            while (e > p && *(e-1) == '\n') {
                --e;
                if (e > p && *(e-1) == '\r')
                    --e;
            }
        }
        return e - p;
    }
    /* A separator longer than the string cannot be a suffix of it. */
    if (rslen > len) return len;

    enc = rb_enc_get(rs);
    /* Last byte of the separator; used below as a cheap pre-check. */
    newline = rsptr[rslen-1];
    if (rslen == rb_enc_mbminlen(enc)) {
        /* Separator is exactly one minimum-width character. */
        if (rslen == 1) {
            if (newline == '\n')
                return smart_chomp(str, e, p);
        }
        else {
            if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
                return smart_chomp(str, e, p);
        }
    }

    enc = rb_enc_check(str, rs);
    if (is_broken_string(rs)) {
        return len;
    }
    pp = e - rslen;
    if (p[len-1] == newline &&
        (rslen <= 1 ||
         memcmp(rsptr, pp, rslen) == 0)) {
        /* Bytes match; only accept when the suffix starts a character. */
        if (at_char_boundary(p, pp, e, enc))
            return len - rslen;
        RB_GC_GUARD(rs);
    }
    return len;
}
10124
10130static VALUE
10131chomp_rs(int argc, const VALUE *argv)
10132{
10133 rb_check_arity(argc, 0, 1);
10134 if (argc > 0) {
10135 VALUE rs = argv[0];
10136 if (!NIL_P(rs)) StringValue(rs);
10137 return rs;
10138 }
10139 else {
10140 return rb_rs;
10141 }
10142}
10143
10144VALUE
10145rb_str_chomp_string(VALUE str, VALUE rs)
10146{
10147 long olen = RSTRING_LEN(str);
10148 long len = chompped_length(str, rs);
10149 if (len >= olen) return Qnil;
10150 str_modify_keep_cr(str);
10151 STR_SET_LEN(str, len);
10152 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10153 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10155 }
10156 return str;
10157}
10158
10159/*
10160 * call-seq:
10161 * chomp!(line_sep = $/) -> self or nil
10162 *
10163 * Like String#chomp, except that:
10164 *
10165 * - Removes trailing characters from +self+ (not from a copy of +self+).
10166 * - Returns +self+ if any characters are removed, +nil+ otherwise.
10167 *
10168 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10169 */
10170
10171static VALUE
10172rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
10173{
10174 VALUE rs;
10175 str_modifiable(str);
10176 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
10177 rs = chomp_rs(argc, argv);
10178 if (NIL_P(rs)) return Qnil;
10179 return rb_str_chomp_string(str, rs);
10180}
10181
10182
10183/*
10184 * call-seq:
10185 * chomp(line_sep = $/) -> new_string
10186 *
10187 * :include: doc/string/chomp.rdoc
10188 *
10189 */
10190
10191static VALUE
10192rb_str_chomp(int argc, VALUE *argv, VALUE str)
10193{
10194 VALUE rs = chomp_rs(argc, argv);
10195 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
10196 return rb_str_subseq(str, 0, chompped_length(str, rs));
10197}
10198
10199static void
10200tr_setup_table_multi(char table[TR_TABLE_SIZE], VALUE *tablep, VALUE *ctablep,
10201 VALUE str, int num_selectors, VALUE *selectors)
10202{
10203 int i;
10204
10205 for (i=0; i<num_selectors; i++) {
10206 VALUE selector = selectors[i];
10207 rb_encoding *enc;
10208
10209 StringValue(selector);
10210 enc = rb_enc_check(str, selector);
10211 tr_setup_table(selector, table, i==0, tablep, ctablep, enc);
10212 }
10213}
10214
10215static long
10216lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10217{
10218 const char *const start = s;
10219
10220 if (!s || s >= e) return 0;
10221
10222 /* remove spaces at head */
10223 if (single_byte_optimizable(str)) {
10224 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
10225 }
10226 else {
10227 while (s < e) {
10228 int n;
10229 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10230
10231 if (cc && !rb_isspace(cc)) break;
10232 s += n;
10233 }
10234 }
10235 return s - start;
10236}
10237
10238static long
10239lstrip_offset_table(VALUE str, const char *s, const char *e, rb_encoding *enc,
10240 char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
10241{
10242 const char *const start = s;
10243
10244 if (!s || s >= e) return 0;
10245
10246 /* remove leading characters in the table */
10247 while (s < e) {
10248 int n;
10249 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10250
10251 if (!tr_find(cc, table, del, nodel)) break;
10252 s += n;
10253 }
10254 return s - start;
10255}
10256
10257/*
10258 * call-seq:
10259 * lstrip!(*selectors) -> self or nil
10260 *
10261 * Like String#lstrip, except that:
10262 *
10263 * - Performs stripping in +self+ (not in a copy of +self+).
10264 * - Returns +self+ if any characters are stripped, +nil+ otherwise.
10265 *
10266 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10267 */
10268
10269static VALUE
10270rb_str_lstrip_bang(int argc, VALUE *argv, VALUE str)
10271{
10272 rb_encoding *enc;
10273 char *start, *s;
10274 long olen, loffset;
10275
10276 str_modify_keep_cr(str);
10277 enc = STR_ENC_GET(str);
10278 RSTRING_GETMEM(str, start, olen);
10279 if (argc > 0) {
10280 char table[TR_TABLE_SIZE];
10281 VALUE del = 0, nodel = 0;
10282
10283 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10284 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10285 }
10286 else {
10287 loffset = lstrip_offset(str, start, start+olen, enc);
10288 }
10289
10290 if (loffset > 0) {
10291 long len = olen-loffset;
10292 s = start + loffset;
10293 memmove(start, s, len);
10294 STR_SET_LEN(str, len);
10295 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10296 return str;
10297 }
10298 return Qnil;
10299}
10300
10301
10302/*
10303 * call-seq:
10304 * lstrip(*selectors) -> new_string
10305 *
10306 * Returns a copy of +self+ with leading whitespace removed;
10307 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10308 *
10309 * whitespace = "\x00\t\n\v\f\r "
10310 * s = whitespace + 'abc' + whitespace
10311 * # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10312 * s.lstrip
10313 * # => "abc\u0000\t\n\v\f\r "
10314 *
10315 * If +selectors+ are given, removes characters of +selectors+ from the beginning of +self+:
10316 *
10317 * s = "---abc+++"
10318 * s.lstrip("-") # => "abc+++"
10319 *
10320 * +selectors+ must be valid character selectors (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
10321 * and may use any of its valid forms, including negation, ranges, and escapes:
10322 *
10323 * "01234abc56789".lstrip("0-9") # "abc56789"
10324 * "01234abc56789".lstrip("0-9", "^4-6") # "4abc56789"
10325 *
10326 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10327 */
10328
10329static VALUE
10330rb_str_lstrip(int argc, VALUE *argv, VALUE str)
10331{
10332 char *start;
10333 long len, loffset;
10334
10335 RSTRING_GETMEM(str, start, len);
10336 if (argc > 0) {
10337 char table[TR_TABLE_SIZE];
10338 VALUE del = 0, nodel = 0;
10339
10340 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10341 loffset = lstrip_offset_table(str, start, start+len, STR_ENC_GET(str), table, del, nodel);
10342 }
10343 else {
10344 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
10345 }
10346 if (loffset <= 0) return str_duplicate(rb_cString, str);
10347 return rb_str_subseq(str, loffset, len - loffset);
10348}
10349
10350static long
10351rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10352{
10353 const char *t;
10354
10355 rb_str_check_dummy_enc(enc);
10357 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10358 }
10359 if (!s || s >= e) return 0;
10360 t = e;
10361
10362 /* remove trailing spaces or '\0's */
10363 if (single_byte_optimizable(str)) {
10364 unsigned char c;
10365 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
10366 }
10367 else {
10368 char *tp;
10369
10370 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10371 unsigned int c = rb_enc_codepoint(tp, e, enc);
10372 if (c && !rb_isspace(c)) break;
10373 t = tp;
10374 }
10375 }
10376 return e - t;
10377}
10378
10379static long
10380rstrip_offset_table(VALUE str, const char *s, const char *e, rb_encoding *enc,
10381 char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
10382{
10383 const char *t;
10384 char *tp;
10385
10386 rb_str_check_dummy_enc(enc);
10388 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10389 }
10390 if (!s || s >= e) return 0;
10391 t = e;
10392
10393 /* remove trailing characters in the table */
10394 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10395 unsigned int c = rb_enc_codepoint(tp, e, enc);
10396 if (!tr_find(c, table, del, nodel)) break;
10397 t = tp;
10398 }
10399
10400 return e - t;
10401}
10402
10403/*
10404 * call-seq:
10405 * rstrip!(*selectors) -> self or nil
10406 *
10407 * Like String#rstrip, except that:
10408 *
10409 * - Performs stripping in +self+ (not in a copy of +self+).
10410 * - Returns +self+ if any characters are stripped, +nil+ otherwise.
10411 *
10412 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10413 */
10414
10415static VALUE
10416rb_str_rstrip_bang(int argc, VALUE *argv, VALUE str)
10417{
10418 rb_encoding *enc;
10419 char *start;
10420 long olen, roffset;
10421
10422 str_modify_keep_cr(str);
10423 enc = STR_ENC_GET(str);
10424 RSTRING_GETMEM(str, start, olen);
10425 if (argc > 0) {
10426 char table[TR_TABLE_SIZE];
10427 VALUE del = 0, nodel = 0;
10428
10429 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10430 roffset = rstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10431 }
10432 else {
10433 roffset = rstrip_offset(str, start, start+olen, enc);
10434 }
10435 if (roffset > 0) {
10436 long len = olen - roffset;
10437
10438 STR_SET_LEN(str, len);
10439 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10440 return str;
10441 }
10442 return Qnil;
10443}
10444
10445
10446/*
10447 * call-seq:
10448 * rstrip(*selectors) -> new_string
10449 *
10450 * Returns a copy of +self+ with trailing whitespace removed;
10451 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10452 *
10453 * whitespace = "\x00\t\n\v\f\r "
10454 * s = whitespace + 'abc' + whitespace
10455 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10456 * s.rstrip # => "\u0000\t\n\v\f\r abc"
10457 *
10458 * If +selectors+ are given, removes characters of +selectors+ from the end of +self+:
10459 *
10460 * s = "---abc+++"
10461 * s.rstrip("+") # => "---abc"
10462 *
10463 * +selectors+ must be valid character selectors (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
10464 * and may use any of its valid forms, including negation, ranges, and escapes:
10465 *
10466 * "01234abc56789".rstrip("0-9") # "01234abc"
10467 * "01234abc56789".rstrip("0-9", "^4-6") # "01234abc56"
10468 *
10469 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10470 */
10471
10472static VALUE
10473rb_str_rstrip(int argc, VALUE *argv, VALUE str)
10474{
10475 rb_encoding *enc;
10476 char *start;
10477 long olen, roffset;
10478
10479 enc = STR_ENC_GET(str);
10480 RSTRING_GETMEM(str, start, olen);
10481 if (argc > 0) {
10482 char table[TR_TABLE_SIZE];
10483 VALUE del = 0, nodel = 0;
10484
10485 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10486 roffset = rstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10487 }
10488 else {
10489 roffset = rstrip_offset(str, start, start+olen, enc);
10490 }
10491 if (roffset <= 0) return str_duplicate(rb_cString, str);
10492 return rb_str_subseq(str, 0, olen-roffset);
10493}
10494
10495
10496/*
10497 * call-seq:
10498 * strip!(*selectors) -> self or nil
10499 *
10500 * Like String#strip, except that:
10501 *
10502 * - Any modifications are made to +self+.
 * - Returns +self+ if any modifications are made, +nil+ otherwise.
10504 *
10505 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10506 */
10507
10508static VALUE
10509rb_str_strip_bang(int argc, VALUE *argv, VALUE str)
10510{
10511 char *start;
10512 long olen, loffset, roffset;
10513 rb_encoding *enc;
10514
10515 str_modify_keep_cr(str);
10516 enc = STR_ENC_GET(str);
10517 RSTRING_GETMEM(str, start, olen);
10518
10519 if (argc > 0) {
10520 char table[TR_TABLE_SIZE];
10521 VALUE del = 0, nodel = 0;
10522
10523 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10524 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10525 roffset = rstrip_offset_table(str, start+loffset, start+olen, enc, table, del, nodel);
10526 }
10527 else {
10528 loffset = lstrip_offset(str, start, start+olen, enc);
10529 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10530 }
10531
10532 if (loffset > 0 || roffset > 0) {
10533 long len = olen-roffset;
10534 if (loffset > 0) {
10535 len -= loffset;
10536 memmove(start, start + loffset, len);
10537 }
10538 STR_SET_LEN(str, len);
10539 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10540 return str;
10541 }
10542 return Qnil;
10543}
10544
10545
10546/*
10547 * call-seq:
10548 * strip(*selectors) -> new_string
10549 *
10550 * Returns a copy of +self+ with leading and trailing whitespace removed;
10551 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10552 *
10553 * whitespace = "\x00\t\n\v\f\r "
10554 * s = whitespace + 'abc' + whitespace
10555 * # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10556 * s.strip # => "abc"
10557 *
10558 * If +selectors+ are given, removes characters of +selectors+ from both ends of +self+:
10559 *
10560 * s = "---abc+++"
10561 * s.strip("-+") # => "abc"
10562 * s.strip("+-") # => "abc"
10563 *
10564 * +selectors+ must be valid character selectors (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
10565 * and may use any of its valid forms, including negation, ranges, and escapes:
10566 *
10567 * "01234abc56789".strip("0-9") # "abc"
10568 * "01234abc56789".strip("0-9", "^4-6") # "4abc56"
10569 *
10570 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10571 */
10572
10573static VALUE
10574rb_str_strip(int argc, VALUE *argv, VALUE str)
10575{
10576 char *start;
10577 long olen, loffset, roffset;
10578 rb_encoding *enc = STR_ENC_GET(str);
10579
10580 RSTRING_GETMEM(str, start, olen);
10581
10582 if (argc > 0) {
10583 char table[TR_TABLE_SIZE];
10584 VALUE del = 0, nodel = 0;
10585
10586 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10587 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10588 roffset = rstrip_offset_table(str, start+loffset, start+olen, enc, table, del, nodel);
10589 }
10590 else {
10591 loffset = lstrip_offset(str, start, start+olen, enc);
10592 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10593 }
10594
10595 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10596 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10597}
10598
/*
 * Performs one step of String#scan: searches +str+ for +pat+ starting at
 * byte offset *+start+.
 *
 * On a match, advances *+start+ past the match (always by at least one
 * character, so empty matches cannot loop forever) and returns either
 * the matched substring (string pattern, or regexp without groups) or an
 * array holding one entry per capture group (nil for groups that did not
 * participate).  Returns nil when there is no match.
 *
 * +set_backref_str+ is passed through to rb_pat_search().
 */
static VALUE
scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
{
    VALUE result = Qnil;
    long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
    if (pos >= 0) {
        VALUE match;
        struct re_registers *regs;
        if (BUILTIN_TYPE(pat) == T_STRING) {
            /* Plain string pattern: no registers, the match is the
             * pattern's own length. */
            regs = NULL;
            end = pos + RSTRING_LEN(pat);
        }
        else {
            /* Otherwise take the match bounds from the back-reference
             * (BEG/END read from +regs+). */
            match = rb_backref_get();
            regs = RMATCH_REGS(match);
            pos = BEG(0);
            end = END(0);
        }

        if (pos == end) {
            rb_encoding *enc = STR_ENC_GET(str);
            /*
             * Always consume at least one character of the input string
             */
            if (RSTRING_LEN(str) > end)
                *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
                                                  RSTRING_END(str), enc);
            else
                *start = end + 1;
        }
        else {
            *start = end;
        }

        if (!regs || regs->num_regs == 1) {
            /* No capture groups: the result is the whole match. */
            result = rb_str_subseq(str, pos, end - pos);
            return result;
        }
        else {
            /* Capture groups: collect groups 1..num_regs-1. */
            result = rb_ary_new2(regs->num_regs);
            for (int i = 1; i < regs->num_regs; i++) {
                VALUE s = Qnil;
                if (BEG(i) >= 0) {
                    s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
                }

                rb_ary_push(result, s);
            }
        }

        /* Only reached on the register path, where +match+ was assigned;
         * keeps it alive while +regs+ was being read. */
        RB_GC_GUARD(match);
    }

    return result;
}
10654
10655
10656/*
10657 * call-seq:
10658 * scan(pattern) -> array_of_results
10659 * scan(pattern) {|result| ... } -> self
10660 *
10661 * :include: doc/string/scan.rdoc
10662 *
10663 */
10664
static VALUE
rb_str_scan(VALUE str, VALUE pat)
{
    VALUE result;
    long start = 0;
    /* +prev+ lags +start+ by one iteration and +last+ lags +prev+, so
     * after the loop +last+ is the offset from which the final
     * successful search was started (-1 when nothing matched). */
    long last = -1, prev = 0;
    /* Snapshot of the buffer, used by str_mod_check() after each yield. */
    char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);

    pat = get_pat_quoted(pat, 1);
    mustnot_broken(str);
    if (!rb_block_given_p()) {
        /* Without a block: collect every result into an array. */
        VALUE ary = rb_ary_new();

        while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
            last = prev;
            prev = start;
            rb_ary_push(ary, result);
        }
        /* Repeat the last successful search with set_backref_str = 1,
         * or reset the back-reference when nothing matched. */
        if (last >= 0) rb_pat_search(pat, str, last, 1);
        else rb_backref_set(Qnil);
        return ary;
    }

    /* With a block: yield each result and return the receiver. */
    while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
        last = prev;
        prev = start;
        rb_yield(result);
        /* Check the receiver against the snapshot taken above. */
        str_mod_check(str, p, len);
    }
    if (last >= 0) rb_pat_search(pat, str, last, 1);
    return str;
}
10697
10698
10699/*
10700 * call-seq:
10701 * hex -> integer
10702 *
10703 * Interprets the leading substring of +self+ as hexadecimal, possibly signed;
10704 * returns its value as an integer.
10705 *
10706 * The leading substring is interpreted as hexadecimal when it begins with:
10707 *
 * - One or more characters representing hexadecimal digits
10709 * (each in one of the ranges <tt>'0'..'9'</tt>, <tt>'a'..'f'</tt>, or <tt>'A'..'F'</tt>);
10710 * the string to be interpreted ends at the first character that does not represent a hexadecimal digit:
10711 *
10712 * 'f'.hex # => 15
10713 * '11'.hex # => 17
10714 * 'FFF'.hex # => 4095
10715 * 'fffg'.hex # => 4095
10716 * 'foo'.hex # => 15 # 'f' hexadecimal, 'oo' not.
10717 * 'bar'.hex # => 186 # 'ba' hexadecimal, 'r' not.
10718 * 'deadbeef'.hex # => 3735928559
10719 *
10720 * - <tt>'0x'</tt> or <tt>'0X'</tt>, followed by one or more hexadecimal digits:
10721 *
10722 * '0xfff'.hex # => 4095
10723 * '0xfffg'.hex # => 4095
10724 *
 * Any of the above may be prefixed with <tt>'-'</tt>, which negates the interpreted value:
10726 *
10727 * '-fff'.hex # => -4095
10728 * '-0xFFF'.hex # => -4095
10729 *
10730 * For any substring not described above, returns zero:
10731 *
10732 * 'xxx'.hex # => 0
10733 * ''.hex # => 0
10734 *
10735 * Note that, unlike #oct, this method interprets only hexadecimal,
10736 * and not binary, octal, or decimal notations:
10737 *
10738 * '0b111'.hex # => 45329
10739 * '0o777'.hex # => 0
10740 * '0d999'.hex # => 55705
10741 *
10742 * Related: See {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
10743 */
10744
10745static VALUE
10746rb_str_hex(VALUE str)
10747{
10748 return rb_str_to_inum(str, 16, FALSE);
10749}
10750
10751
10752/*
10753 * call-seq:
10754 * oct -> integer
10755 *
10756 * Interprets the leading substring of +self+ as octal, binary, decimal, or hexadecimal, possibly signed;
 * returns its value as an integer.
10758 *
10759 * In brief:
10760 *
10761 * # Interpreted as octal.
10762 * '777'.oct # => 511
10763 * '777x'.oct # => 511
10764 * '0777'.oct # => 511
10765 * '0o777'.oct # => 511
10766 * '-777'.oct # => -511
10767 * # Not interpreted as octal.
10768 * '0b111'.oct # => 7 # Interpreted as binary.
10769 * '0d999'.oct # => 999 # Interpreted as decimal.
10770 * '0xfff'.oct # => 4095 # Interpreted as hexadecimal.
10771 *
10772 * The leading substring is interpreted as octal when it begins with:
10773 *
 * - One or more characters representing octal digits
10775 * (each in the range <tt>'0'..'7'</tt>);
10776 * the string to be interpreted ends at the first character that does not represent an octal digit:
10777 *
 *     '7'.oct # => 7
10779 * '11'.oct # => 9
10780 * '777'.oct # => 511
10781 * '0777'.oct # => 511
10782 * '7778'.oct # => 511
10783 * '777x'.oct # => 511
10784 *
10785 * - <tt>'0o'</tt>, followed by one or more octal digits:
10786 *
10787 * '0o777'.oct # => 511
10788 * '0o7778'.oct # => 511
10789 *
10790 * The leading substring is _not_ interpreted as octal when it begins with:
10791 *
10792 * - <tt>'0b'</tt>, followed by one or more characters representing binary digits
10793 * (each in the range <tt>'0'..'1'</tt>);
10794 * the string to be interpreted ends at the first character that does not represent a binary digit.
10795 * the string is interpreted as binary digits (base 2):
10796 *
10797 * '0b111'.oct # => 7
10798 * '0b1112'.oct # => 7
10799 *
10800 * - <tt>'0d'</tt>, followed by one or more characters representing decimal digits
10801 * (each in the range <tt>'0'..'9'</tt>);
10802 * the string to be interpreted ends at the first character that does not represent a decimal digit.
10803 * the string is interpreted as decimal digits (base 10):
10804 *
10805 * '0d999'.oct # => 999
10806 * '0d999x'.oct # => 999
10807 *
10808 * - <tt>'0x'</tt>, followed by one or more characters representing hexadecimal digits
10809 * (each in one of the ranges <tt>'0'..'9'</tt>, <tt>'a'..'f'</tt>, or <tt>'A'..'F'</tt>);
10810 * the string to be interpreted ends at the first character that does not represent a hexadecimal digit.
10811 * the string is interpreted as hexadecimal digits (base 16):
10812 *
10813 * '0xfff'.oct # => 4095
10814 * '0xfffg'.oct # => 4095
10815 *
 * Any of the above may be prefixed with <tt>'-'</tt>, which negates the interpreted value:
10817 *
10818 * '-777'.oct # => -511
10819 * '-0777'.oct # => -511
10820 * '-0b111'.oct # => -7
10821 * '-0xfff'.oct # => -4095
10822 *
10823 * For any substring not described above, returns zero:
10824 *
10825 * 'foo'.oct # => 0
10826 * ''.oct # => 0
10827 *
10828 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non--5CString].
10829 */
10830
10831static VALUE
10832rb_str_oct(VALUE str)
10833{
10834 return rb_str_to_inum(str, -8, FALSE);
10835}
10836
#ifndef HAVE_CRYPT_R
# include "ruby/thread_native.h"
# include "ruby/atomic.h"

/*
 * Native lock that rb_str_crypt() takes around crypt(3) and the copy of
 * its result when crypt_r(3) is not available.  Only compiled in that
 * configuration.
 */
static struct {
    rb_nativethread_lock_t lock;
} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
#endif
10845
10846/*
10847 * call-seq:
10848 * crypt(salt_str) -> new_string
10849 *
10850 * Returns the string generated by calling <code>crypt(3)</code>
10851 * standard library function with <code>str</code> and
10852 * <code>salt_str</code>, in this order, as its arguments. Please do
10853 * not use this method any longer. It is legacy; provided only for
10854 * backward compatibility with ruby scripts in earlier days. It is
10855 * bad to use in contemporary programs for several reasons:
10856 *
10857 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10858 * run. The generated string lacks data portability.
10859 *
10860 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10861 * (i.e. silently ends up in unexpected results).
10862 *
10863 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10864 * thread safe.
10865 *
10866 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10867 * very very weak. According to its manpage, Linux's traditional
10868 * <code>crypt(3)</code> output has only 2**56 variations; too
10869 * easy to brute force today. And this is the default behaviour.
10870 *
10871 * * In order to make things robust some OSes implement so-called
10872 * "modular" usage. To go through, you have to do a complex
10873 * build-up of the <code>salt_str</code> parameter, by hand.
10874 * Failure in generation of a proper salt string tends not to
10875 * yield any errors; typos in parameters are normally not
10876 * detectable.
10877 *
10878 * * For instance, in the following example, the second invocation
10879 * of String#crypt is wrong; it has a typo in "round=" (lacks
10880 * "s"). However the call does not fail and something unexpected
10881 * is generated.
10882 *
10883 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10884 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10885 *
10886 * * Even in the "modular" mode, some hash functions are considered
10887 * archaic and no longer recommended at all; for instance module
10888 * <code>$1$</code> is officially abandoned by its author: see
10889 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10890 * instance module <code>$3$</code> is considered completely
10891 * broken: see the manpage of FreeBSD.
10892 *
10893 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10894 * written above, <code>crypt(3)</code> on Mac OS never fails.
10895 * This means even if you build up a proper salt string it
10896 * generates a traditional DES hash anyways, and there is no way
10897 * for you to be aware of.
10898 *
10899 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10900 *
10901 * If for some reason you cannot migrate to other secure contemporary
10902 * password hashing algorithms, install the string-crypt gem and
10903 * <code>require 'string/crypt'</code> to continue using it.
10904 */
10905
static VALUE
rb_str_crypt(VALUE str, VALUE salt)
{
    /* CRYPT_END() releases whatever the chosen backend acquired: the
     * crypt_r() work buffer, or the lock guarding crypt(). */
#ifdef HAVE_CRYPT_R
    VALUE databuf;
    struct crypt_data *data;
# define CRYPT_END() ALLOCV_END(databuf)
#else
    char *tmp_buf;
    extern char *crypt(const char *, const char *);
# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
#endif
    VALUE result;
    const char *s, *saltp;
    char *res;
#ifdef BROKEN_CRYPT
    char salt_8bit_clean[3];
#endif

    StringValue(salt);
    mustnot_wchar(str);
    mustnot_wchar(salt);
    s = StringValueCStr(str);
    saltp = RSTRING_PTR(salt);
    /* The salt must provide at least two non-NUL leading bytes. */
    if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
        rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
    }

#ifdef BROKEN_CRYPT
    /* If either of the first two salt bytes is non-ASCII, use a
     * two-character copy with the high bit masked off instead. */
    if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
        salt_8bit_clean[0] = saltp[0] & 0x7f;
        salt_8bit_clean[1] = saltp[1] & 0x7f;
        salt_8bit_clean[2] = '\0';
        saltp = salt_8bit_clean;
    }
#endif
#ifdef HAVE_CRYPT_R
    data = ALLOCV(databuf, sizeof(struct crypt_data));
# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
    data->initialized = 0;
# endif
    res = crypt_r(s, saltp, data);
#else
    rb_nativethread_lock_lock(&crypt_mutex.lock);
    res = crypt(s, saltp);
#endif
    if (!res) {
        /* Save errno before cleanup can overwrite it. */
        int err = errno;
        CRYPT_END();
        rb_syserr_fail(err, "crypt");
    }
#ifdef HAVE_CRYPT_R
    /* +res+ points into +data+, so copy it before releasing the buffer. */
    result = rb_str_new_cstr(res);
    CRYPT_END();
#else
    // We need to copy this buffer because it's static and we need to unlock the mutex
    // before allocating a new object (the string to be returned). If we allocate while
    // holding the lock, we could run GC which fires the VM barrier and causes a deadlock
    // if other ractors are waiting on this lock.
    size_t res_size = strlen(res)+1;
    tmp_buf = ALLOCA_N(char, res_size); // should be small enough to alloca
    memcpy(tmp_buf, res, res_size);
    res = tmp_buf;
    CRYPT_END();
    result = rb_str_new_cstr(res);
#endif
    return result;
}
10974
10975
10976/*
10977 * call-seq:
10978 * ord -> integer
10979 *
10980 * :include: doc/string/ord.rdoc
10981 *
10982 */
10983
10984static VALUE
10985rb_str_ord(VALUE s)
10986{
10987 unsigned int c;
10988
10989 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10990 return UINT2NUM(c);
10991}
10992/*
10993 * call-seq:
10994 * sum(n = 16) -> integer
10995 *
10996 * :include: doc/string/sum.rdoc
10997 *
10998 */
10999
11000static VALUE
11001rb_str_sum(int argc, VALUE *argv, VALUE str)
11002{
11003 int bits = 16;
11004 char *ptr, *p, *pend;
11005 long len;
11006 VALUE sum = INT2FIX(0);
11007 unsigned long sum0 = 0;
11008
11009 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
11010 bits = 0;
11011 }
11012 ptr = p = RSTRING_PTR(str);
11013 len = RSTRING_LEN(str);
11014 pend = p + len;
11015
11016 while (p < pend) {
11017 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
11018 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11019 str_mod_check(str, ptr, len);
11020 sum0 = 0;
11021 }
11022 sum0 += (unsigned char)*p;
11023 p++;
11024 }
11025
11026 if (bits == 0) {
11027 if (sum0) {
11028 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11029 }
11030 }
11031 else {
11032 if (sum == INT2FIX(0)) {
11033 if (bits < (int)sizeof(long)*CHAR_BIT) {
11034 sum0 &= (((unsigned long)1)<<bits)-1;
11035 }
11036 sum = LONG2FIX(sum0);
11037 }
11038 else {
11039 VALUE mod;
11040
11041 if (sum0) {
11042 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11043 }
11044
11045 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
11046 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
11047 sum = rb_funcall(sum, '&', 1, mod);
11048 }
11049 }
11050 return sum;
11051}
11052
11053static VALUE
11054rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
11055{
11056 rb_encoding *enc;
11057 VALUE w;
11058 long width, len, flen = 1, fclen = 1;
11059 VALUE res;
11060 char *p;
11061 const char *f = " ";
11062 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
11063 VALUE pad;
11064 int singlebyte = 1, cr;
11065 int termlen;
11066
11067 rb_scan_args(argc, argv, "11", &w, &pad);
11068 enc = STR_ENC_GET(str);
11069 termlen = rb_enc_mbminlen(enc);
11070 width = NUM2LONG(w);
11071 if (argc == 2) {
11072 StringValue(pad);
11073 enc = rb_enc_check(str, pad);
11074 f = RSTRING_PTR(pad);
11075 flen = RSTRING_LEN(pad);
11076 fclen = str_strlen(pad, enc); /* rb_enc_check */
11077 singlebyte = single_byte_optimizable(pad);
11078 if (flen == 0 || fclen == 0) {
11079 rb_raise(rb_eArgError, "zero width padding");
11080 }
11081 }
11082 len = str_strlen(str, enc); /* rb_enc_check */
11083 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
11084 n = width - len;
11085 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
11086 rlen = n - llen;
11087 cr = ENC_CODERANGE(str);
11088 if (flen > 1) {
11089 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
11090 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
11091 }
11092 size = RSTRING_LEN(str);
11093 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
11094 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
11095 (len += llen2 + rlen2) >= LONG_MAX - size) {
11096 rb_raise(rb_eArgError, "argument too big");
11097 }
11098 len += size;
11099 res = str_enc_new(rb_cString, 0, len, enc);
11100 p = RSTRING_PTR(res);
11101 if (flen <= 1) {
11102 memset(p, *f, llen);
11103 p += llen;
11104 }
11105 else {
11106 while (llen >= fclen) {
11107 memcpy(p,f,flen);
11108 p += flen;
11109 llen -= fclen;
11110 }
11111 if (llen > 0) {
11112 memcpy(p, f, llen2);
11113 p += llen2;
11114 }
11115 }
11116 memcpy(p, RSTRING_PTR(str), size);
11117 p += size;
11118 if (flen <= 1) {
11119 memset(p, *f, rlen);
11120 p += rlen;
11121 }
11122 else {
11123 while (rlen >= fclen) {
11124 memcpy(p,f,flen);
11125 p += flen;
11126 rlen -= fclen;
11127 }
11128 if (rlen > 0) {
11129 memcpy(p, f, rlen2);
11130 p += rlen2;
11131 }
11132 }
11133 TERM_FILL(p, termlen);
11134 STR_SET_LEN(res, p-RSTRING_PTR(res));
11135
11136 if (argc == 2)
11137 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
11138 if (cr != ENC_CODERANGE_BROKEN)
11139 ENC_CODERANGE_SET(res, cr);
11140
11141 RB_GC_GUARD(pad);
11142 return res;
11143}
11144
11145
11146/*
11147 * call-seq:
11148 * ljust(width, pad_string = ' ') -> new_string
11149 *
11150 * :include: doc/string/ljust.rdoc
11151 *
11152 */
11153
11154static VALUE
11155rb_str_ljust(int argc, VALUE *argv, VALUE str)
11156{
11157 return rb_str_justify(argc, argv, str, 'l');
11158}
11159
11160/*
11161 * call-seq:
11162 * rjust(width, pad_string = ' ') -> new_string
11163 *
11164 * :include: doc/string/rjust.rdoc
11165 *
11166 */
11167
11168static VALUE
11169rb_str_rjust(int argc, VALUE *argv, VALUE str)
11170{
11171 return rb_str_justify(argc, argv, str, 'r');
11172}
11173
11174
11175/*
11176 * call-seq:
11177 * center(size, pad_string = ' ') -> new_string
11178 *
11179 * :include: doc/string/center.rdoc
11180 *
11181 */
11182
11183static VALUE
11184rb_str_center(int argc, VALUE *argv, VALUE str)
11185{
11186 return rb_str_justify(argc, argv, str, 'c');
11187}
11188
11189/*
11190 * call-seq:
11191 * partition(pattern) -> [pre_match, first_match, post_match]
11192 *
11193 * :include: doc/string/partition.rdoc
11194 *
11195 */
11196
11197static VALUE
11198rb_str_partition(VALUE str, VALUE sep)
11199{
11200 long pos;
11201
11202 sep = get_pat_quoted(sep, 0);
11203 if (RB_TYPE_P(sep, T_REGEXP)) {
11204 if (rb_reg_search(sep, str, 0, 0) < 0) {
11205 goto failed;
11206 }
11207 VALUE match = rb_backref_get();
11208 struct re_registers *regs = RMATCH_REGS(match);
11209
11210 pos = BEG(0);
11211 sep = rb_str_subseq(str, pos, END(0) - pos);
11212 }
11213 else {
11214 pos = rb_str_index(str, sep, 0);
11215 if (pos < 0) goto failed;
11216 }
11217 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11218 sep,
11219 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11220 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11221
11222 failed:
11223 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11224}
11225
11226/*
11227 * call-seq:
11228 * rpartition(pattern) -> [pre_match, last_match, post_match]
11229 *
11230 * :include: doc/string/rpartition.rdoc
11231 *
11232 */
11233
11234static VALUE
11235rb_str_rpartition(VALUE str, VALUE sep)
11236{
11237 long pos = RSTRING_LEN(str);
11238
11239 sep = get_pat_quoted(sep, 0);
11240 if (RB_TYPE_P(sep, T_REGEXP)) {
11241 if (rb_reg_search(sep, str, pos, 1) < 0) {
11242 goto failed;
11243 }
11244 VALUE match = rb_backref_get();
11245 struct re_registers *regs = RMATCH_REGS(match);
11246
11247 pos = BEG(0);
11248 sep = rb_str_subseq(str, pos, END(0) - pos);
11249 }
11250 else {
11251 pos = rb_str_sublen(str, pos);
11252 pos = rb_str_rindex(str, sep, pos);
11253 if (pos < 0) {
11254 goto failed;
11255 }
11256 }
11257
11258 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11259 sep,
11260 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11261 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11262 failed:
11263 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
11264}
11265
11266/*
11267 * call-seq:
11268 * start_with?(*patterns) -> true or false
11269 *
11270 * :include: doc/string/start_with_p.rdoc
11271 *
11272 */
11273
11274static VALUE
11275rb_str_start_with(int argc, VALUE *argv, VALUE str)
11276{
11277 int i;
11278
11279 for (i=0; i<argc; i++) {
11280 VALUE tmp = argv[i];
11281 if (RB_TYPE_P(tmp, T_REGEXP)) {
11282 if (rb_reg_start_with_p(tmp, str))
11283 return Qtrue;
11284 }
11285 else {
11286 const char *p, *s, *e;
11287 long slen, tlen;
11288 rb_encoding *enc;
11289
11290 StringValue(tmp);
11291 enc = rb_enc_check(str, tmp);
11292 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11293 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11294 p = RSTRING_PTR(str);
11295 e = p + slen;
11296 s = p + tlen;
11297 if (!at_char_right_boundary(p, s, e, enc))
11298 continue;
11299 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11300 return Qtrue;
11301 }
11302 }
11303 return Qfalse;
11304}
11305
11306/*
11307 * call-seq:
11308 * end_with?(*strings) -> true or false
11309 *
11310 * :include: doc/string/end_with_p.rdoc
11311 *
11312 */
11313
11314static VALUE
11315rb_str_end_with(int argc, VALUE *argv, VALUE str)
11316{
11317 int i;
11318
11319 for (i=0; i<argc; i++) {
11320 VALUE tmp = argv[i];
11321 const char *p, *s, *e;
11322 long slen, tlen;
11323 rb_encoding *enc;
11324
11325 StringValue(tmp);
11326 enc = rb_enc_check(str, tmp);
11327 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11328 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11329 p = RSTRING_PTR(str);
11330 e = p + slen;
11331 s = e - tlen;
11332 if (!at_char_boundary(p, s, e, enc))
11333 continue;
11334 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11335 return Qtrue;
11336 }
11337 return Qfalse;
11338}
11339
11349static long
11350deleted_prefix_length(VALUE str, VALUE prefix)
11351{
11352 const char *strptr, *prefixptr;
11353 long olen, prefixlen;
11354 rb_encoding *enc = rb_enc_get(str);
11355
11356 StringValue(prefix);
11357
11358 if (!is_broken_string(prefix) ||
11359 !rb_enc_asciicompat(enc) ||
11360 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11361 enc = rb_enc_check(str, prefix);
11362 }
11363
11364 /* return 0 if not start with prefix */
11365 prefixlen = RSTRING_LEN(prefix);
11366 if (prefixlen <= 0) return 0;
11367 olen = RSTRING_LEN(str);
11368 if (olen < prefixlen) return 0;
11369 strptr = RSTRING_PTR(str);
11370 prefixptr = RSTRING_PTR(prefix);
11371 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
11372 if (is_broken_string(prefix)) {
11373 if (!is_broken_string(str)) {
11374 /* prefix in a valid string cannot be broken */
11375 return 0;
11376 }
11377 const char *strend = strptr + olen;
11378 const char *after_prefix = strptr + prefixlen;
11379 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11380 /* prefix does not end at char-boundary */
11381 return 0;
11382 }
11383 }
11384 /* prefix part in `str` also should be valid. */
11385
11386 return prefixlen;
11387}
11388
11389/*
11390 * call-seq:
11391 * delete_prefix!(prefix) -> self or nil
11392 *
11393 * Like String#delete_prefix, except that +self+ is modified in place;
11394 * returns +self+ if the prefix is removed, +nil+ otherwise.
11395 *
11396 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11397 */
11398
11399static VALUE
11400rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
11401{
11402 long prefixlen;
11403 str_modify_keep_cr(str);
11404
11405 prefixlen = deleted_prefix_length(str, prefix);
11406 if (prefixlen <= 0) return Qnil;
11407
11408 return rb_str_drop_bytes(str, prefixlen);
11409}
11410
11411/*
11412 * call-seq:
11413 * delete_prefix(prefix) -> new_string
11414 *
11415 * :include: doc/string/delete_prefix.rdoc
11416 *
11417 */
11418
11419static VALUE
11420rb_str_delete_prefix(VALUE str, VALUE prefix)
11421{
11422 long prefixlen;
11423
11424 prefixlen = deleted_prefix_length(str, prefix);
11425 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
11426
11427 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11428}
11429
11439static long
11440deleted_suffix_length(VALUE str, VALUE suffix)
11441{
11442 const char *strptr, *suffixptr;
11443 long olen, suffixlen;
11444 rb_encoding *enc;
11445
11446 StringValue(suffix);
11447 if (is_broken_string(suffix)) return 0;
11448 enc = rb_enc_check(str, suffix);
11449
11450 /* return 0 if not start with suffix */
11451 suffixlen = RSTRING_LEN(suffix);
11452 if (suffixlen <= 0) return 0;
11453 olen = RSTRING_LEN(str);
11454 if (olen < suffixlen) return 0;
11455 strptr = RSTRING_PTR(str);
11456 suffixptr = RSTRING_PTR(suffix);
11457 const char *strend = strptr + olen;
11458 const char *before_suffix = strend - suffixlen;
11459 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
11460 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
11461
11462 return suffixlen;
11463}
11464
11465/*
11466 * call-seq:
11467 * delete_suffix!(suffix) -> self or nil
11468 *
11469 * Like String#delete_suffix, except that +self+ is modified in place;
11470 * returns +self+ if the suffix is removed, +nil+ otherwise.
11471 *
11472 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11473 */
11474
11475static VALUE
11476rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
11477{
11478 long olen, suffixlen, len;
11479 str_modifiable(str);
11480
11481 suffixlen = deleted_suffix_length(str, suffix);
11482 if (suffixlen <= 0) return Qnil;
11483
11484 olen = RSTRING_LEN(str);
11485 str_modify_keep_cr(str);
11486 len = olen - suffixlen;
11487 STR_SET_LEN(str, len);
11488 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
11489 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
11491 }
11492 return str;
11493}
11494
11495/*
11496 * call-seq:
11497 * delete_suffix(suffix) -> new_string
11498 *
11499 * :include: doc/string/delete_suffix.rdoc
11500 *
11501 */
11502
11503static VALUE
11504rb_str_delete_suffix(VALUE str, VALUE suffix)
11505{
11506 long suffixlen;
11507
11508 suffixlen = deleted_suffix_length(str, suffix);
11509 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
11510
11511 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11512}
11513
11514void
11515rb_str_setter(VALUE val, ID id, VALUE *var)
11516{
11517 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
11518 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
11519 }
11520 *var = val;
11521}
11522
11523static void
11524nil_setter_warning(ID id)
11525{
11526 rb_warn_deprecated("non-nil '%"PRIsVALUE"'", NULL, rb_id2str(id));
11527}
11528
11529void
11530rb_deprecated_str_setter(VALUE val, ID id, VALUE *var)
11531{
11532 rb_str_setter(val, id, var);
11533 if (!NIL_P(*var)) {
11534 nil_setter_warning(id);
11535 }
11536}
11537
11538static void
11539rb_fs_setter(VALUE val, ID id, VALUE *var)
11540{
11541 val = rb_fs_check(val);
11542 if (!val) {
11543 rb_raise(rb_eTypeError,
11544 "value of %"PRIsVALUE" must be String or Regexp",
11545 rb_id2str(id));
11546 }
11547 if (!NIL_P(val)) {
11548 nil_setter_warning(id);
11549 }
11550 *var = val;
11551}
11552
11553
11554/*
11555 * call-seq:
11556 * force_encoding(encoding) -> self
11557 *
11558 * :include: doc/string/force_encoding.rdoc
11559 *
11560 */
11561
11562static VALUE
11563rb_str_force_encoding(VALUE str, VALUE enc)
11564{
11565 str_modifiable(str);
11566
11567 rb_encoding *encoding = rb_to_encoding(enc);
11568 int idx = rb_enc_to_index(encoding);
11569
11570 // If the encoding is unchanged, we do nothing.
11571 if (ENCODING_GET(str) == idx) {
11572 return str;
11573 }
11574
11575 rb_enc_associate_index(str, idx);
11576
11577 // If the coderange was 7bit and the new encoding is ASCII-compatible
11578 // we can keep the coderange.
11579 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
11580 return str;
11581 }
11582
11584 return str;
11585}
11586
11587/*
11588 * call-seq:
11589 * b -> new_string
11590 *
11591 * :include: doc/string/b.rdoc
11592 *
11593 */
11594
11595static VALUE
11596rb_str_b(VALUE str)
11597{
11598 VALUE str2;
11599 if (STR_EMBED_P(str)) {
11600 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11601 }
11602 else {
11603 str2 = str_alloc_heap(rb_cString);
11604 }
11605 str_replace_shared_without_enc(str2, str);
11606
11607 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11608 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11609 // If we know the receiver's code range then we know the result's code range.
11610 int cr = ENC_CODERANGE(str);
11611 switch (cr) {
11612 case ENC_CODERANGE_7BIT:
11614 break;
11618 break;
11619 default:
11620 ENC_CODERANGE_CLEAR(str2);
11621 break;
11622 }
11623 }
11624
11625 return str2;
11626}
11627
11628/*
11629 * call-seq:
11630 * valid_encoding? -> true or false
11631 *
11632 * :include: doc/string/valid_encoding_p.rdoc
11633 *
11634 */
11635
11636static VALUE
11637rb_str_valid_encoding_p(VALUE str)
11638{
11639 int cr = rb_enc_str_coderange(str);
11640
11641 return RBOOL(cr != ENC_CODERANGE_BROKEN);
11642}
11643
11644/*
11645 * call-seq:
11646 * ascii_only? -> true or false
11647 *
11648 * Returns whether +self+ contains only ASCII characters:
11649 *
11650 * 'abc'.ascii_only? # => true
11651 * "abc\u{6666}".ascii_only? # => false
11652 *
11653 * Related: see {Querying}[rdoc-ref:String@Querying].
11654 */
11655
11656static VALUE
11657rb_str_is_ascii_only_p(VALUE str)
11658{
11659 int cr = rb_enc_str_coderange(str);
11660
11661 return RBOOL(cr == ENC_CODERANGE_7BIT);
11662}
11663
11664VALUE
11666{
11667 static const char ellipsis[] = "...";
11668 const long ellipsislen = sizeof(ellipsis) - 1;
11669 rb_encoding *const enc = rb_enc_get(str);
11670 const long blen = RSTRING_LEN(str);
11671 const char *const p = RSTRING_PTR(str), *e = p + blen;
11672 VALUE estr, ret = 0;
11673
11674 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11675 if (len * rb_enc_mbminlen(enc) >= blen ||
11676 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11677 ret = str;
11678 }
11679 else if (len <= ellipsislen ||
11680 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11681 if (rb_enc_asciicompat(enc)) {
11682 ret = rb_str_new(ellipsis, len);
11683 rb_enc_associate(ret, enc);
11684 }
11685 else {
11686 estr = rb_usascii_str_new(ellipsis, len);
11687 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11688 }
11689 }
11690 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11691 rb_str_cat(ret, ellipsis, ellipsislen);
11692 }
11693 else {
11694 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11695 rb_enc_from_encoding(enc), 0, Qnil);
11696 rb_str_append(ret, estr);
11697 }
11698 return ret;
11699}
11700
11701static VALUE
11702str_compat_and_valid(VALUE str, rb_encoding *enc)
11703{
11704 int cr;
11705 str = StringValue(str);
11706 cr = rb_enc_str_coderange(str);
11707 if (cr == ENC_CODERANGE_BROKEN) {
11708 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11709 }
11710 else {
11711 rb_encoding *e = STR_ENC_GET(str);
11712 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11713 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11714 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11715 }
11716 }
11717 return str;
11718}
11719
11720static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11721
11722VALUE
11724{
11725 rb_encoding *enc = STR_ENC_GET(str);
11726 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11727}
11728
11729VALUE
11730rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11731{
11732 int cr = ENC_CODERANGE_UNKNOWN;
11733 if (enc == STR_ENC_GET(str)) {
11734 /* cached coderange makes sense only when enc equals the
11735 * actual encoding of str */
11736 cr = ENC_CODERANGE(str);
11737 }
11738 return enc_str_scrub(enc, str, repl, cr);
11739}
11740
11741static VALUE
11742enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11743{
11744 int encidx;
11745 VALUE buf = Qnil;
11746 const char *rep, *p, *e, *p1, *sp;
11747 long replen = -1;
11748 long slen;
11749
11750 if (rb_block_given_p()) {
11751 if (!NIL_P(repl))
11752 rb_raise(rb_eArgError, "both of block and replacement given");
11753 replen = 0;
11754 }
11755
11756 if (ENC_CODERANGE_CLEAN_P(cr))
11757 return Qnil;
11758
11759 if (!NIL_P(repl)) {
11760 repl = str_compat_and_valid(repl, enc);
11761 }
11762
11763 if (rb_enc_dummy_p(enc)) {
11764 return Qnil;
11765 }
11766 encidx = rb_enc_to_index(enc);
11767
11768#define DEFAULT_REPLACE_CHAR(str) do { \
11769 RBIMPL_ATTR_NONSTRING() static const char replace[sizeof(str)-1] = str; \
11770 rep = replace; replen = (int)sizeof(replace); \
11771 } while (0)
11772
11773 slen = RSTRING_LEN(str);
11774 p = RSTRING_PTR(str);
11775 e = RSTRING_END(str);
11776 p1 = p;
11777 sp = p;
11778
11779 if (rb_enc_asciicompat(enc)) {
11780 int rep7bit_p;
11781 if (!replen) {
11782 rep = NULL;
11783 rep7bit_p = FALSE;
11784 }
11785 else if (!NIL_P(repl)) {
11786 rep = RSTRING_PTR(repl);
11787 replen = RSTRING_LEN(repl);
11788 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11789 }
11790 else if (encidx == rb_utf8_encindex()) {
11791 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11792 rep7bit_p = FALSE;
11793 }
11794 else {
11795 DEFAULT_REPLACE_CHAR("?");
11796 rep7bit_p = TRUE;
11797 }
11798 cr = ENC_CODERANGE_7BIT;
11799
11800 p = search_nonascii(p, e);
11801 if (!p) {
11802 p = e;
11803 }
11804 while (p < e) {
11805 int ret = rb_enc_precise_mbclen(p, e, enc);
11806 if (MBCLEN_NEEDMORE_P(ret)) {
11807 break;
11808 }
11809 else if (MBCLEN_CHARFOUND_P(ret)) {
11811 p += MBCLEN_CHARFOUND_LEN(ret);
11812 }
11813 else if (MBCLEN_INVALID_P(ret)) {
11814 /*
11815 * p1~p: valid ascii/multibyte chars
11816 * p ~e: invalid bytes + unknown bytes
11817 */
11818 long clen = rb_enc_mbmaxlen(enc);
11819 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11820 if (p > p1) {
11821 rb_str_buf_cat(buf, p1, p - p1);
11822 }
11823
11824 if (e - p < clen) clen = e - p;
11825 if (clen <= 2) {
11826 clen = 1;
11827 }
11828 else {
11829 const char *q = p;
11830 clen--;
11831 for (; clen > 1; clen--) {
11832 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11833 if (MBCLEN_NEEDMORE_P(ret)) break;
11834 if (MBCLEN_INVALID_P(ret)) continue;
11836 }
11837 }
11838 if (rep) {
11839 rb_str_buf_cat(buf, rep, replen);
11840 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11841 }
11842 else {
11843 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11844 str_mod_check(str, sp, slen);
11845 repl = str_compat_and_valid(repl, enc);
11846 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11849 }
11850 p += clen;
11851 p1 = p;
11852 p = search_nonascii(p, e);
11853 if (!p) {
11854 p = e;
11855 break;
11856 }
11857 }
11858 else {
11860 }
11861 }
11862 if (NIL_P(buf)) {
11863 if (p == e) {
11864 ENC_CODERANGE_SET(str, cr);
11865 return Qnil;
11866 }
11867 buf = rb_str_buf_new(RSTRING_LEN(str));
11868 }
11869 if (p1 < p) {
11870 rb_str_buf_cat(buf, p1, p - p1);
11871 }
11872 if (p < e) {
11873 if (rep) {
11874 rb_str_buf_cat(buf, rep, replen);
11875 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11876 }
11877 else {
11878 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11879 str_mod_check(str, sp, slen);
11880 repl = str_compat_and_valid(repl, enc);
11881 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11884 }
11885 }
11886 }
11887 else {
11888 /* ASCII incompatible */
11889 long mbminlen = rb_enc_mbminlen(enc);
11890 if (!replen) {
11891 rep = NULL;
11892 }
11893 else if (!NIL_P(repl)) {
11894 rep = RSTRING_PTR(repl);
11895 replen = RSTRING_LEN(repl);
11896 }
11897 else if (encidx == ENCINDEX_UTF_16BE) {
11898 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11899 }
11900 else if (encidx == ENCINDEX_UTF_16LE) {
11901 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11902 }
11903 else if (encidx == ENCINDEX_UTF_32BE) {
11904 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11905 }
11906 else if (encidx == ENCINDEX_UTF_32LE) {
11907 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11908 }
11909 else {
11910 DEFAULT_REPLACE_CHAR("?");
11911 }
11912
11913 while (p < e) {
11914 int ret = rb_enc_precise_mbclen(p, e, enc);
11915 if (MBCLEN_NEEDMORE_P(ret)) {
11916 break;
11917 }
11918 else if (MBCLEN_CHARFOUND_P(ret)) {
11919 p += MBCLEN_CHARFOUND_LEN(ret);
11920 }
11921 else if (MBCLEN_INVALID_P(ret)) {
11922 const char *q = p;
11923 long clen = rb_enc_mbmaxlen(enc);
11924 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11925 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11926
11927 if (e - p < clen) clen = e - p;
11928 if (clen <= mbminlen * 2) {
11929 clen = mbminlen;
11930 }
11931 else {
11932 clen -= mbminlen;
11933 for (; clen > mbminlen; clen-=mbminlen) {
11934 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11935 if (MBCLEN_NEEDMORE_P(ret)) break;
11936 if (MBCLEN_INVALID_P(ret)) continue;
11938 }
11939 }
11940 if (rep) {
11941 rb_str_buf_cat(buf, rep, replen);
11942 }
11943 else {
11944 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11945 str_mod_check(str, sp, slen);
11946 repl = str_compat_and_valid(repl, enc);
11947 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11948 }
11949 p += clen;
11950 p1 = p;
11951 }
11952 else {
11954 }
11955 }
11956 if (NIL_P(buf)) {
11957 if (p == e) {
11959 return Qnil;
11960 }
11961 buf = rb_str_buf_new(RSTRING_LEN(str));
11962 }
11963 if (p1 < p) {
11964 rb_str_buf_cat(buf, p1, p - p1);
11965 }
11966 if (p < e) {
11967 if (rep) {
11968 rb_str_buf_cat(buf, rep, replen);
11969 }
11970 else {
11971 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11972 str_mod_check(str, sp, slen);
11973 repl = str_compat_and_valid(repl, enc);
11974 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11975 }
11976 }
11978 }
11979 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11980 return buf;
11981}
11982
11983/*
11984 * call-seq:
11985 * scrub(replacement_string = default_replacement_string) -> new_string
11986 * scrub{|sequence| ... } -> new_string
11987 *
11988 * :include: doc/string/scrub.rdoc
11989 *
11990 */
11991static VALUE
11992str_scrub(int argc, VALUE *argv, VALUE str)
11993{
11994 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11995 VALUE new = rb_str_scrub(str, repl);
11996 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11997}
11998
11999/*
12000 * call-seq:
12001 * scrub!(replacement_string = default_replacement_string) -> self
12002 * scrub!{|sequence| ... } -> self
12003 *
12004 * Like String#scrub, except that:
12005 *
12006 * - Any replacements are made in +self+.
12007 * - Returns +self+.
12008 *
12009 * Related: see {Modifying}[rdoc-ref:String@Modifying].
12010 *
12011 */
12012static VALUE
12013str_scrub_bang(int argc, VALUE *argv, VALUE str)
12014{
12015 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
12016 VALUE new = rb_str_scrub(str, repl);
12017 if (!NIL_P(new)) rb_str_replace(str, new);
12018 return str;
12019}
12020
/* Method IDs that unicode_normalize_common() dispatches to on
 * mUnicodeNormalize; they are initialized outside this section. */
static ID id_normalize;
static ID id_normalized_p;
/* Receiver of the normalization calls made by unicode_normalize_common(). */
static VALUE mUnicodeNormalize;
12024
12025static VALUE
12026unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
12027{
12028 static int UnicodeNormalizeRequired = 0;
12029 VALUE argv2[2];
12030
12031 if (!UnicodeNormalizeRequired) {
12032 rb_require("unicode_normalize/normalize.rb");
12033 UnicodeNormalizeRequired = 1;
12034 }
12035 argv2[0] = str;
12036 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
12037 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
12038}
12039
12040/*
12041 * call-seq:
12042 * unicode_normalize(form = :nfc) -> string
12043 *
12044 * :include: doc/string/unicode_normalize.rdoc
12045 *
12046 */
12047static VALUE
12048rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
12049{
12050 return unicode_normalize_common(argc, argv, str, id_normalize);
12051}
12052
12053/*
12054 * call-seq:
12055 * unicode_normalize!(form = :nfc) -> self
12056 *
12057 * Like String#unicode_normalize, except that the normalization
12058 * is performed on +self+ (not on a copy of +self+).
12059 *
12060 * Related: see {Modifying}[rdoc-ref:String@Modifying].
12061 *
12062 */
12063static VALUE
12064rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
12065{
12066 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
12067}
12068
12069/* call-seq:
12070 * unicode_normalized?(form = :nfc) -> true or false
12071 *
12072 * Returns whether +self+ is in the given +form+ of Unicode normalization;
12073 * see String#unicode_normalize.
12074 *
12075 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
12076 *
12077 * Examples:
12078 *
12079 * "a\u0300".unicode_normalized? # => false
12080 * "a\u0300".unicode_normalized?(:nfd) # => true
12081 * "\u00E0".unicode_normalized? # => true
12082 * "\u00E0".unicode_normalized?(:nfd) # => false
12083 *
12084 *
12085 * Raises an exception if +self+ is not in a Unicode encoding:
12086 *
12087 * s = "\xE0".force_encoding(Encoding::ISO_8859_1)
12088 * s.unicode_normalized? # Raises Encoding::CompatibilityError
12089 *
12090 * Related: see {Querying}[rdoc-ref:String@Querying].
12091 */
12092static VALUE
12093rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
12094{
12095 return unicode_normalize_common(argc, argv, str, id_normalized_p);
12096}
12097
12098/**********************************************************************
12099 * Document-class: Symbol
12100 *
12101 * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
12102 *
12103 * You can create a +Symbol+ object explicitly with:
12104 *
12105 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
12106 *
12107 * The same +Symbol+ object will be
12108 * created for a given name or string for the duration of a program's
12109 * execution, regardless of the context or meaning of that name. Thus
12110 * if <code>Fred</code> is a constant in one context, a method in
12111 * another, and a class in a third, the +Symbol+ <code>:Fred</code>
12112 * will be the same object in all three contexts.
12113 *
12114 * module One
12115 * class Fred
12116 * end
12117 * $f1 = :Fred
12118 * end
12119 * module Two
12120 * Fred = 1
12121 * $f2 = :Fred
12122 * end
12123 * def Fred()
12124 * end
12125 * $f3 = :Fred
12126 * $f1.object_id #=> 2514190
12127 * $f2.object_id #=> 2514190
12128 * $f3.object_id #=> 2514190
12129 *
12130 * Constant, method, and variable names are returned as symbols:
12131 *
12132 * module One
12133 * Two = 2
12134 * def three; 3 end
12135 * @four = 4
12136 * @@five = 5
12137 * $six = 6
12138 * end
12139 * seven = 7
12140 *
12141 * One.constants
12142 * # => [:Two]
12143 * One.instance_methods(true)
12144 * # => [:three]
12145 * One.instance_variables
12146 * # => [:@four]
12147 * One.class_variables
12148 * # => [:@@five]
12149 * global_variables.grep(/six/)
12150 * # => [:$six]
12151 * local_variables
12152 * # => [:seven]
12153 *
12154 * A +Symbol+ object differs from a String object in that
12155 * a +Symbol+ object represents an identifier, while a String object
12156 * represents text or data.
12157 *
12158 * == What's Here
12159 *
12160 * First, what's elsewhere. Class +Symbol+:
12161 *
12162 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
12163 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
12164 *
12165 * Here, class +Symbol+ provides methods that are useful for:
12166 *
12167 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
12168 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
12169 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
12170 *
12171 * === Methods for Querying
12172 *
12173 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
12174 * - #=~: Returns the index of the first substring in symbol that matches a
12175 * given Regexp or other object; returns +nil+ if no match is found.
12176 * - #[], #slice: Returns a substring of symbol
12177 * determined by a given index, start/length, range, regexp, or string.
12178 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12179 * - #encoding: Returns the Encoding object that represents the encoding
12180 * of symbol.
12181 * - #end_with?: Returns +true+ if symbol ends with
12182 * any of the given strings.
12183 * - #match: Returns a MatchData object if symbol
12184 * matches a given Regexp; +nil+ otherwise.
12185 * - #match?: Returns +true+ if symbol
12186 * matches a given Regexp; +false+ otherwise.
12187 * - #length, #size: Returns the number of characters in symbol.
12188 * - #start_with?: Returns +true+ if symbol starts with
12189 * any of the given strings.
12190 *
12191 * === Methods for Comparing
12192 *
12193 * - #<=>: Returns -1, 0, or 1 as symbol is smaller than, equal to,
12194 * or larger than a given symbol.
12195 * - #==, #===: Returns +true+ if a given symbol has the same content and
12196 * encoding.
12197 * - #casecmp: Ignoring case, returns -1, 0, or 1 as symbol
12198 * is smaller than, equal to, or larger than a given symbol.
12199 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
12200 * after Unicode case folding; +false+ otherwise.
12201 *
12202 * === Methods for Converting
12203 *
12204 * - #capitalize: Returns symbol with the first character upcased
12205 * and all other characters downcased.
12206 * - #downcase: Returns symbol with all characters downcased.
12207 * - #inspect: Returns the string representation of +self+ as a symbol literal.
12208 * - #name: Returns the frozen string corresponding to symbol.
12209 * - #succ, #next: Returns the symbol that is the successor to symbol.
12210 * - #swapcase: Returns symbol with all upcase characters downcased
12211 * and all downcase characters upcased.
12212 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
12213 * - #to_s, #id2name: Returns the string corresponding to +self+.
12214 * - #to_sym, #intern: Returns +self+.
12215 * - #upcase: Returns symbol with all characters upcased.
12216 *
12217 */
12218
12219
12220/*
12221 * call-seq:
12222 * symbol == object -> true or false
12223 *
12224 * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
12225 */
12226
12227#define sym_equal rb_obj_equal /* Symbol#== and Symbol#=== are plain object-identity checks */
12228
12229static int
12230sym_printable(const char *s, const char *send, rb_encoding *enc)
12231{
12232 while (s < send) {
12233 int n;
12234 int c = rb_enc_precise_mbclen(s, send, enc);
12235
12236 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
12237 n = MBCLEN_CHARFOUND_LEN(c);
12238 c = rb_enc_mbc_to_codepoint(s, send, enc);
12239 if (!rb_enc_isprint(c, enc)) return FALSE;
12240 s += n;
12241 }
12242 return TRUE;
12243}
12244
12245int
12246rb_str_symname_p(VALUE sym)
12247{
12248 rb_encoding *enc;
12249 const char *ptr;
12250 long len;
12251 rb_encoding *resenc = rb_default_internal_encoding();
12252
12253 if (resenc == NULL) resenc = rb_default_external_encoding();
12254 enc = STR_ENC_GET(sym);
12255 ptr = RSTRING_PTR(sym);
12256 len = RSTRING_LEN(sym);
12257 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
12258 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
12259 return FALSE;
12260 }
12261 return TRUE;
12262}
12263
12264VALUE
12265rb_str_quote_unprintable(VALUE str)
12266{
12267 rb_encoding *enc;
12268 const char *ptr;
12269 long len;
12270 rb_encoding *resenc;
12271
12272 Check_Type(str, T_STRING);
12274 if (resenc == NULL) resenc = rb_default_external_encoding();
12275 enc = STR_ENC_GET(str);
12276 ptr = RSTRING_PTR(str);
12277 len = RSTRING_LEN(str);
12278 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12279 !sym_printable(ptr, ptr + len, enc)) {
12280 return rb_str_escape(str);
12281 }
12282 return str;
12283}
12284
12285VALUE
12286rb_id_quote_unprintable(ID id)
12287{
12288 VALUE str = rb_id2str(id);
12289 if (!rb_str_symname_p(str)) {
12290 return rb_str_escape(str);
12291 }
12292 return str;
12293}
12294
12295/*
12296 * call-seq:
12297 * inspect -> string
12298 *
12299 * Returns a string representation of +self+ (including the leading colon):
12300 *
12301 * :foo.inspect # => ":foo"
12302 *
12303 * Related: Symbol#to_s, Symbol#name.
12304 *
12305 */
12306
12307static VALUE
12308sym_inspect(VALUE sym)
12309{
12310 VALUE str = rb_sym2str(sym);
12311 const char *ptr;
12312 long len;
12313 char *dest;
12314
12315 if (!rb_str_symname_p(str)) {
12316 str = rb_str_inspect(str);
12317 len = RSTRING_LEN(str);
12318 rb_str_resize(str, len + 1);
12319 dest = RSTRING_PTR(str);
12320 memmove(dest + 1, dest, len);
12321 }
12322 else {
12323 rb_encoding *enc = STR_ENC_GET(str);
12324 VALUE orig_str = str;
12325
12326 len = RSTRING_LEN(orig_str);
12327 str = rb_enc_str_new(0, len + 1, enc);
12328
12329 // Get data pointer after allocation
12330 ptr = RSTRING_PTR(orig_str);
12331 dest = RSTRING_PTR(str);
12332 memcpy(dest + 1, ptr, len);
12333
12334 RB_GC_GUARD(orig_str);
12335 }
12336 dest[0] = ':';
12337
12339
12340 return str;
12341}
12342
12343VALUE
12345{
12346 VALUE str = str_new_shared(rb_cString, rb_sym2str(sym));
12347 FL_SET_RAW(str, STR_CHILLED_SYMBOL_TO_S);
12348 return str;
12349}
12350
12351VALUE
12352rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
12353{
12354 VALUE obj;
12355
12356 if (argc < 1) {
12357 rb_raise(rb_eArgError, "no receiver given");
12358 }
12359 obj = argv[0];
12360 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
12361}
12362
12363/*
12364 * call-seq:
12365 * succ
12366 *
12367 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
12368 *
12369 * :foo.succ # => :fop
12370 *
12371 * Related: String#succ.
12372 */
12373
12374static VALUE
12375sym_succ(VALUE sym)
12376{
12377 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
12378}
12379
12380/*
12381 * call-seq:
12382 * self <=> other -> -1, 0, 1, or nil
12383 *
12384 * Compares +self+ and +other+, using String#<=>.
12385 *
12386 * Returns:
12387 *
12388 * - <tt>self.to_s <=> other.to_s</tt>, if +other+ is a symbol.
12389 * - +nil+, otherwise.
12390 *
12391 * Examples:
12392 *
12393 * :bar <=> :foo # => -1
12394 * :foo <=> :foo # => 0
12395 * :foo <=> :bar # => 1
12396 * :foo <=> 'bar' # => nil
12397 *
12398 * \Class \Symbol includes module Comparable,
12399 * each of whose methods uses Symbol#<=> for comparison.
12400 *
12401 * Related: String#<=>.
12402 */
12403
12404static VALUE
12405sym_cmp(VALUE sym, VALUE other)
12406{
12407 if (!SYMBOL_P(other)) {
12408 return Qnil;
12409 }
12410 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
12411}
12412
12413/*
12414 * call-seq:
12415 * casecmp(object) -> -1, 0, 1, or nil
12416 *
12417 * :include: doc/symbol/casecmp.rdoc
12418 *
12419 */
12420
12421static VALUE
12422sym_casecmp(VALUE sym, VALUE other)
12423{
12424 if (!SYMBOL_P(other)) {
12425 return Qnil;
12426 }
12427 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
12428}
12429
12430/*
12431 * call-seq:
12432 * casecmp?(object) -> true, false, or nil
12433 *
12434 * :include: doc/symbol/casecmp_p.rdoc
12435 *
12436 */
12437
12438static VALUE
12439sym_casecmp_p(VALUE sym, VALUE other)
12440{
12441 if (!SYMBOL_P(other)) {
12442 return Qnil;
12443 }
12444 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
12445}
12446
12447/*
12448 * call-seq:
12449 * symbol =~ object -> integer or nil
12450 *
12451 * Equivalent to <tt>symbol.to_s =~ object</tt>,
12452 * including possible updates to global variables;
12453 * see String#=~.
12454 *
12455 */
12456
12457static VALUE
12458sym_match(VALUE sym, VALUE other)
12459{
12460 return rb_str_match(rb_sym2str(sym), other);
12461}
12462
12463/*
12464 * call-seq:
12465 * match(pattern, offset = 0) -> matchdata or nil
12466 * match(pattern, offset = 0) {|matchdata| } -> object
12467 *
12468 * Equivalent to <tt>self.to_s.match</tt>,
12469 * including possible updates to global variables;
12470 * see String#match.
12471 *
12472 */
12473
12474static VALUE
12475sym_match_m(int argc, VALUE *argv, VALUE sym)
12476{
12477 return rb_str_match_m(argc, argv, rb_sym2str(sym));
12478}
12479
12480/*
12481 * call-seq:
12482 * match?(pattern, offset) -> true or false
12483 *
12484 * Equivalent to <tt>sym.to_s.match?</tt>;
12485 * see String#match?.
12486 *
12487 */
12488
12489static VALUE
12490sym_match_m_p(int argc, VALUE *argv, VALUE sym)
12491{
12492 return rb_str_match_m_p(argc, argv, sym);
12493}
12494
12495/*
12496 * call-seq:
12497 * symbol[index] -> string or nil
12498 * symbol[start, length] -> string or nil
12499 * symbol[range] -> string or nil
12500 * symbol[regexp, capture = 0] -> string or nil
12501 * symbol[substring] -> string or nil
12502 *
12503 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
12504 *
12505 */
12506
12507static VALUE
12508sym_aref(int argc, VALUE *argv, VALUE sym)
12509{
12510 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
12511}
12512
12513/*
12514 * call-seq:
12515 * length -> integer
12516 *
12517 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
12518 */
12519
12520static VALUE
12521sym_length(VALUE sym)
12522{
12523 return rb_str_length(rb_sym2str(sym));
12524}
12525
12526/*
12527 * call-seq:
12528 * empty? -> true or false
12529 *
12530 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
12531 *
12532 */
12533
12534static VALUE
12535sym_empty(VALUE sym)
12536{
12537 return rb_str_empty(rb_sym2str(sym));
12538}
12539
12540/*
12541 * call-seq:
12542 * upcase(mapping) -> symbol
12543 *
12544 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
12545 *
12546 * See String#upcase.
12547 *
12548 */
12549
12550static VALUE
12551sym_upcase(int argc, VALUE *argv, VALUE sym)
12552{
12553 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
12554}
12555
12556/*
12557 * call-seq:
12558 * downcase(mapping) -> symbol
12559 *
12560 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
12561 *
12562 * See String#downcase.
12563 *
12564 * Related: Symbol#upcase.
12565 *
12566 */
12567
12568static VALUE
12569sym_downcase(int argc, VALUE *argv, VALUE sym)
12570{
12571 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
12572}
12573
12574/*
12575 * call-seq:
12576 * capitalize(mapping) -> symbol
12577 *
12578 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12579 *
12580 * See String#capitalize.
12581 *
12582 */
12583
12584static VALUE
12585sym_capitalize(int argc, VALUE *argv, VALUE sym)
12586{
12587 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12588}
12589
12590/*
12591 * call-seq:
12592 * swapcase(mapping) -> symbol
12593 *
12594 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12595 *
12596 * See String#swapcase.
12597 *
12598 */
12599
12600static VALUE
12601sym_swapcase(int argc, VALUE *argv, VALUE sym)
12602{
12603 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12604}
12605
12606/*
12607 * call-seq:
12608 * start_with?(*string_or_regexp) -> true or false
12609 *
12610 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12611 *
12612 */
12613
12614static VALUE
12615sym_start_with(int argc, VALUE *argv, VALUE sym)
12616{
12617 return rb_str_start_with(argc, argv, rb_sym2str(sym));
12618}
12619
12620/*
12621 * call-seq:
12622 * end_with?(*strings) -> true or false
12623 *
12624 *
12625 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12626 *
12627 */
12628
12629static VALUE
12630sym_end_with(int argc, VALUE *argv, VALUE sym)
12631{
12632 return rb_str_end_with(argc, argv, rb_sym2str(sym));
12633}
12634
12635/*
12636 * call-seq:
12637 * encoding -> encoding
12638 *
12639 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12640 *
12641 */
12642
12643static VALUE
12644sym_encoding(VALUE sym)
12645{
12646 return rb_obj_encoding(rb_sym2str(sym));
12647}
12648
12649static VALUE
12650string_for_symbol(VALUE name)
12651{
12652 if (!RB_TYPE_P(name, T_STRING)) {
12653 VALUE tmp = rb_check_string_type(name);
12654 if (NIL_P(tmp)) {
12655 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol nor a string",
12656 name);
12657 }
12658 name = tmp;
12659 }
12660 return name;
12661}
12662
12663ID
12665{
12666 if (SYMBOL_P(name)) {
12667 return SYM2ID(name);
12668 }
12669 name = string_for_symbol(name);
12670 return rb_intern_str(name);
12671}
12672
12673VALUE
12675{
12676 if (SYMBOL_P(name)) {
12677 return name;
12678 }
12679 name = string_for_symbol(name);
12680 return rb_str_intern(name);
12681}
12682
12683/*
12684 * call-seq:
12685 * Symbol.all_symbols -> array_of_symbols
12686 *
12687 * Returns an array of all symbols currently in Ruby's symbol table:
12688 *
12689 * Symbol.all_symbols.size # => 9334
12690 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12691 *
12692 */
12693
12694static VALUE
12695sym_all_symbols(VALUE _)
12696{
12697 return rb_sym_all_symbols();
12698}
12699
12700VALUE
12701rb_str_to_interned_str(VALUE str)
12702{
12703 return rb_fstring(str);
12704}
12705
12706VALUE
12707rb_interned_str(const char *ptr, long len)
12708{
12709 struct RString fake_str = {RBASIC_INIT};
12710 int encidx = ENCINDEX_US_ASCII;
12711 int coderange = ENC_CODERANGE_7BIT;
12712 if (len > 0 && search_nonascii(ptr, ptr + len)) {
12713 encidx = ENCINDEX_ASCII_8BIT;
12714 coderange = ENC_CODERANGE_VALID;
12715 }
12716 VALUE str = setup_fake_str(&fake_str, ptr, len, encidx);
12717 ENC_CODERANGE_SET(str, coderange);
12718 return register_fstring(str, true, false);
12719}
12720
12721VALUE
12722rb_interned_str_cstr(const char *ptr)
12723{
12724 return rb_interned_str(ptr, strlen(ptr));
12725}
12726
12727VALUE
12728rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12729{
12730 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12731 rb_enc_autoload(enc);
12732 }
12733
12734 struct RString fake_str = {RBASIC_INIT};
12735 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false);
12736}
12737
12738VALUE
12739rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc)
12740{
12741 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12742 rb_enc_autoload(enc);
12743 }
12744
12745 struct RString fake_str = {RBASIC_INIT};
12746 VALUE str = register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, true);
12747 RUBY_ASSERT(RB_OBJ_SHAREABLE_P(str) && (rb_gc_verify_shareable(str), 1));
12748 return str;
12749}
12750
12751VALUE
12752rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
12753{
12754 return rb_enc_interned_str(ptr, strlen(ptr), enc);
12755}
12756
#if USE_YJIT || USE_ZJIT
/*
 * Appends +codepoint+ to +str+.  When +str+ is ASCII-8BIT and the code is
 * in 0...0xff, the single byte is appended directly; every other case
 * (including 0xff itself) goes through rb_str_concat().
 */
void
rb_jit_str_concat_codepoint(VALUE str, VALUE codepoint)
{
    const int is_binary = (ENCODING_GET_INLINED(str) == rb_ascii8bit_encindex());

    if (RB_LIKELY(is_binary)) {
        /* the conversion is only attempted on the binary fast path */
        const ssize_t code = RB_NUM2SSIZE(codepoint);
        const int fits_fast_path = (code >= 0 && code < 0xff);

        if (RB_LIKELY(fits_fast_path)) {
            rb_str_buf_cat_byte(str, (char) code);
            return;
        }
    }

    rb_str_concat(str, codepoint);
}
#endif
12773
12774static int
12775fstring_set_class_i(VALUE *str, void *data)
12776{
12777 RBASIC_SET_CLASS(*str, rb_cString);
12778
12779 return ST_CONTINUE;
12780}
12781
12782void
12783Init_String(void)
12784{
12785 rb_cString = rb_define_class("String", rb_cObject);
12786
12787 rb_concurrent_set_foreach_with_replace(fstring_table_obj, fstring_set_class_i, NULL);
12788
12790 rb_define_alloc_func(rb_cString, empty_str_alloc);
12791 rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12792 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12793 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12795 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12796 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12799 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12800 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12801 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12802 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12805 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12806 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12807 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12808 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12811 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12812 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12813 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12814 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12815 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12817 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12819 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12820 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12821 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12822 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12823 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12824 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12825 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12826 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12827 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12828 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12829 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12830 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12831 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12832 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12834 rb_define_method(rb_cString, "+@", str_uplus, 0);
12835 rb_define_method(rb_cString, "-@", str_uminus, 0);
12836 rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
12837 rb_define_alias(rb_cString, "dedup", "-@");
12838
12839 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12840 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12841 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12842 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12845 rb_define_method(rb_cString, "undump", str_undump, 0);
12846
12847 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12848 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12849 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12850 sym_fold = ID2SYM(rb_intern_const("fold"));
12851
12852 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12853 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12854 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12855 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12856
12857 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12858 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12859 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12860 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12861
12862 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12863 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12864 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12865 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12866 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12867 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12868 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12869 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12870 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12871 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12872 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12873 rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
12875 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12876 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12877 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12878 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12879 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12880
12881 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12882 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12883 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12884
12885 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12886
12887 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12888 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12889 rb_define_method(rb_cString, "center", rb_str_center, -1);
12890
12891 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12892 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12893 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12894 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12895 rb_define_method(rb_cString, "strip", rb_str_strip, -1);
12896 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, -1);
12897 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, -1);
12898 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12899 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12900
12901 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12902 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12903 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12904 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12905 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, -1);
12906 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, -1);
12907 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, -1);
12908 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12909 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12910
12911 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12912 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12913 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12914 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12915 rb_define_method(rb_cString, "count", rb_str_count, -1);
12916
12917 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12918 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12919 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12920 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12921
12922 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12923 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12924 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12925 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12926 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12927
12928 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12929
12930 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12931 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12932
12933 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12934 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12935
12936 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12937 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12938 rb_define_method(rb_cString, "b", rb_str_b, 0);
12939 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12940 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12941
12942 /* define UnicodeNormalize module here so that we don't have to look it up */
12943 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12944 id_normalize = rb_intern_const("normalize");
12945 id_normalized_p = rb_intern_const("normalized?");
12946
12947 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12948 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12949 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12950
12951 rb_fs = Qnil;
12952 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12953 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12954 rb_gc_register_address(&rb_fs);
12955
12956 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12960 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12961
12962 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12963 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12964 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12965 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12966 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12967 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12968
12969 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12970 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12971 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12972 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12973
12974 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12975 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12976 rb_define_method(rb_cSymbol, "length", sym_length, 0);
12977 rb_define_method(rb_cSymbol, "size", sym_length, 0);
12978 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12979 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12980 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12981
12982 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12983 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12984 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12985 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12986
12987 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12988 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12989
12990 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12991}
12992
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:199
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
Definition assert.h:291
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:219
Atomic operations.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
Definition coderange.h:39
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
#define ISSPACE
@old{rb_isspace}
Definition ctype.h:88
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define ISDIGIT
@old{rb_isdigit}
Definition ctype.h:93
#define ISALPHA
@old{rb_isalpha}
Definition ctype.h:92
#define TOLOWER
@old{rb_tolower}
Definition ctype.h:101
#define ISPRINT
@old{rb_isprint}
Definition ctype.h:86
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1198
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:877
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition fl_type.h:463
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition class.c:1685
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:1478
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1591
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2843
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2655
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition class.c:3133
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:1010
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:2922
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:108
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:106
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:133
#define rb_str_buf_cat2
Old name of rb_usascii_str_new_cstr.
Definition string.h:1683
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:404
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:136
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1684
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
Definition value_type.h:57
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:134
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:205
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:109
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:401
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:131
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:128
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:658
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:67
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
Definition fl_type.h:125
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:518
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:519
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
Definition memory.h:405
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:68
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:130
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:66
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:108
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:132
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:110
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:129
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:137
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:69
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition error.c:476
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:653
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3909
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1435
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1431
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1438
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1429
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1433
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:675
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:2208
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
Definition object.c:2226
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1354
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
Definition object.c:3622
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:264
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition object.c:582
VALUE rb_cSymbol
Symbol class.
Definition string.c:85
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:176
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1342
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:84
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3306
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition gc.h:603
Encoding-related APIs.
rb_encoding * rb_utf8_encoding(void)
Queries the encoding that represents UTF-8.
Definition encoding.c:1535
rb_encoding * rb_ascii8bit_encoding(void)
Queries the encoding that represents ASCII-8BIT a.k.a.
Definition encoding.c:1523
rb_encoding * rb_filesystem_encoding(void)
Queries the "filesystem" encoding.
Definition encoding.c:1598
rb_encoding * rb_default_internal_encoding(void)
Queries the "default internal" encoding.
Definition encoding.c:1743
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:683
int rb_utf8_encindex(void)
Identical to rb_utf8_encoding(), except it returns the encoding's index instead of the encoding itsel...
Definition encoding.c:1541
int rb_ascii8bit_encindex(void)
Identical to rb_ascii8bit_encoding(), except it returns the encoding's index instead of the encoding ...
Definition encoding.c:1529
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:704
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:571
rb_encoding * rb_default_external_encoding(void)
Queries the "default external" encoding.
Definition encoding.c:1656
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:447
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition encoding.h:99
rb_encoding * rb_locale_encoding(void)
Queries the encoding that represents the current locale.
Definition encoding.c:1586
rb_encoding * rb_usascii_encoding(void)
Queries the encoding that represents US-ASCII.
Definition encoding.c:1547
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:619
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:726
int rb_usascii_encindex(void)
Identical to rb_usascii_encoding(), except it returns the encoding's index instead of the encoding it...
Definition encoding.c:1553
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1342
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:947
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1207
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:3030
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1226
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
Definition string.c:12728
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:253
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2334
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:3734
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1155
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1447
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1348
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:966
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
Definition string.c:12752
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:831
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:703
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1485
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2711
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2974
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1741
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1117
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1204
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition gc.h:479
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should alloc...
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that it adds only one element.
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:208
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:242
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_hash_new(void)
Creates a new, empty hash object.
Definition hash.c:1464
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:714
VALUE rb_default_rs
This is the default value of rb_rs, i.e.
Definition io.c:207
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:2030
VALUE rb_sym_all_symbols(void)
Collects every single bits of symbols that have ever interned in the entire history of the current pr...
Definition symbol.c:1060
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:2036
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1950
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1231
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4223
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3720
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1485
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1922
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1752
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:1512
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2487
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1584
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition string.h:946
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition string.h:940
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3799
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1423
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:12344
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2560
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition string.c:1399
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1746
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:3058
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:5341
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:4162
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition string.c:3155
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:11665
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1782
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1499
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1788
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1682
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1189
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US-ASCII" encoding.
Definition string.h:1533
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:1001
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1518
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1996
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:4148
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3567
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2423
VALUE rb_str_resurrect(VALUE str)
Like rb_str_dup(), but always create an instance of rb_cString regardless of the given object's class...
Definition string.c:2014
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1640
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US-ASCII" encoding.
Definition string.h:1568
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6548
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C string instead of a Ruby string.
Definition string.c:3163
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1147
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C's string.
Definition string.c:12722
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition string.c:1429
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition string.h:1605
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
Definition string.c:3765
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:3105
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:4269
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3389
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:7227
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2792
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:12707
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:4216
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:4036
int rb_str_comparable(VALUE str1, VALUE str2)
Checks whether two strings are comparable with each other.
Definition string.c:4191
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1693
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:3741
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:3280
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5825
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:11723
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1626
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1702
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:632
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:2952
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:3252
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1657
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3371
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1201
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1550
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2746
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7334
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1411
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1718
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2437
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1515
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5743
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:9341
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1195
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition symbol.c:937
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition string.c:1850
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:2030
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
Definition variable.c:2109
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:3416
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1664
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:285
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:993
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:12674
ID rb_to_id(VALUE str)
Identical to rb_intern_str(), except it tries to convert the parameter object to an instance of rb_cS...
Definition string.c:12664
int capa
Designed capacity of the buffer.
Definition io.h:11
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
#define RB_OBJ_SHAREABLE_P(obj)
Queries if the passed object has previously classified as shareable or not.
Definition ractor.h:235
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1861
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3499
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4467
int ruby_thread_has_gvl_p(void)
Whether the current thread is holding the GVL.
Definition thread.c:2103
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:215
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1372
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:360
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
Defines RBIMPL_ATTR_NONSTRING.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:166
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1441
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition string.c:2929
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
Definition rstring.h:438
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:409
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:450
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2811
static long RSTRING_LEN(VALUE str)
Queries the length of the string.
Definition rstring.h:367
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition string.c:1435
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2824
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1779
static char * RSTRING_PTR(VALUE str)
Queries the contents pointer of the string.
Definition rstring.h:381
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:89
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:461
struct rb_data_type_struct rb_data_type_t
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:205
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition load.c:1466
#define errno
Ractor-aware version of errno.
Definition ruby.h:388
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
Definition size_t.h:49
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:81
Ruby's String.
Definition rstring.h:196
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:199
long capa
Capacity of *ptr.
Definition rstring.h:232
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:206
union RString::@157025041137035241047331270155043025061071337053::@157067065136062356112324002106172053054013023024::@365170260060164113275356137374160141226332013204 aux
Auxiliary info.
struct RString::@157025041137035241047331270155043025061071337053::@153056146250355212360325351117351053336274231135 embed
Embedded contents.
VALUE shared
Parent of the string.
Definition rstring.h:240
char * ptr
Pointer to the contents of the string.
Definition rstring.h:222
union RString::@157025041137035241047331270155043025061071337053 as
String's specific fields.
struct RString::@157025041137035241047331270155043025061071337053::@157067065136062356112324002106172053054013023024 heap
Strings that use separated memory region for contents use this pattern.
Definition string.c:8221
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:307
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
Definition value_type.h:225
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predicate failure.
Definition value_type.h:433
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376
ruby_value_type
C-level type of an object.
Definition value_type.h:113